[med-svn] [iqtree] 01/05: New upstream version 1.5.2+dfsg

Kevin Murray daube-guest at moszumanska.debian.org
Sun Dec 4 01:29:03 UTC 2016


This is an automated email from the git hooks/post-receive script.

daube-guest pushed a commit to branch master
in repository iqtree.

commit 80d358298be5121dd976ba4e053538dd27f18f3c
Author: Kevin Murray <kdmfoss at gmail.com>
Date:   Sun Dec 4 11:25:31 2016 +1100

    New upstream version 1.5.2+dfsg
---
 CMakeLists.txt                     |  416 ++++--
 MPIHelper.cpp                      |  560 +++++++
 MPIHelper.h                        |  305 ++++
 ObjectStream.cpp                   |  113 ++
 ObjectStream.h                     |   82 ++
 README.md                          |   71 +-
 TreeCollection.cpp                 |   56 +
 TreeCollection.h                   |   63 +
 alignment.cpp                      |   21 +-
 alignment.h                        |    7 +-
 candidateset.cpp                   |  584 +++++---
 candidateset.h                     |  260 ++--
 checkpoint.cpp                     |  120 +-
 checkpoint.h                       |   20 +-
 constrainttree.cpp                 |  211 +++
 constrainttree.h                   |   81 ++
 example/example.nex                |    6 +-
 example/example.phy                |   64 +-
 gsl/gauss.cpp                      |   12 +
 hashsplitset.h                     |   19 +-
 iqtree.cpp                         | 2064 +++++++++++++++++---------
 iqtree.h                           |  378 +++--
 iqtree_config.h.in                 |    3 +
 lpwrapper.c                        |    2 +-
 memslot.cpp                        |  254 ++++
 memslot.h                          |  111 ++
 mexttree.cpp                       |   73 +-
 mexttree.h                         |   10 +
 model/modelcodon.cpp               |    8 +
 model/modelfactory.cpp             |   34 +-
 model/modelmixture.cpp             |   26 +-
 model/modelmixture.h               |   11 +
 model/modelpomo.cpp                |    3 +-
 model/modelset.cpp                 |  136 +-
 model/modelset.h                   |    7 +-
 model/modelsubst.h                 |    6 +
 model/partitionmodel.cpp           |    6 +-
 model/ratefree.cpp                 |   23 +-
 model/rategamma.cpp                |    8 +-
 model/rategamma.h                  |    8 +-
 model/rategammainvar.cpp           |    3 +-
 model/rateheterogeneity.h          |   12 +-
 model/rateinvar.cpp                |   11 +-
 model/rateinvar.h                  |    2 +-
 mtree.cpp                          |  391 +++--
 mtree.h                            |  123 +-
 mtreeset.cpp                       |    1 +
 ngs.cpp                            |    2 +-
 node.cpp                           |    6 +-
 node.h                             |   20 +-
 optimization.cpp                   |   29 +-
 pda.cpp                            |  196 ++-
 pdtree.cpp                         |    5 +-
 phyloanalysis.cpp                  |  369 +++--
 phyloanalysis.h                    |    6 +-
 phylokernel.h                      |  904 +++++++-----
 phylokernelavx512.cpp              |  120 ++
 phylokernelfma.cpp                 |  164 +++
 phylokernelmixrate.h               |    2 +-
 phylokernelmixture.h               |    2 +-
 phylokernelnew.h                   | 2802 ++++++++++++++++++++++++++++++++++++
 phylokernel.h => phylokernelsafe.h | 1155 +++++++++------
 phylokernelsitemodel.cpp           |    6 +-
 phylokernelsse.cpp                 |  169 +++
 phylonode.cpp                      |   49 +-
 phylonode.h                        |   22 +-
 phylosupertree.cpp                 |   76 +-
 phylosupertree.h                   |    7 +-
 phylosupertreeplen.cpp             |  164 ++-
 phylosupertreeplen.h               |    9 +-
 phylotesting.cpp                   |  316 +++-
 phylotesting.h                     |   21 +-
 phylotree.cpp                      |  874 ++++++-----
 phylotree.h                        |  400 +++--
 phylotreeavx.cpp                   |  203 +--
 phylotreepars.cpp                  |  354 +++--
 phylotreesse.cpp                   | 2647 ++++++++++------------------------
 pllnni.cpp                         |   53 +-
 pllnni.h                           |   22 +-
 quartet.cpp                        |    2 +-
 split.cpp                          |    6 +-
 split.h                            |    7 +-
 splitgraph.cpp                     |    2 +-
 splitgraph.h                       |    2 +-
 splitset.cpp                       |    6 +-
 splitset.h                         |    6 +-
 stoprule.cpp                       |   44 +-
 stoprule.h                         |   19 +-
 superalignment.cpp                 |   66 +-
 superalignment.h                   |   19 +
 test_scripts/README                |   22 +-
 test_scripts/compile.sh            |  113 +-
 test_scripts/gen_test_standard.py  |   34 +-
 test_scripts/generate_test_cmds.py |   97 --
 test_scripts/run_tests.sh          |   50 +
 test_scripts/submit_jobs.sh        |    8 +-
 test_scripts/submitjob.sh          |    2 -
 test_scripts/test_configs.txt      |   18 +-
 test_scripts/test_data/d59_8.nex   |    2 -
 tools.cpp                          |  339 ++++-
 tools.h                            |  180 ++-
 vectorclass/changelog.txt          |   32 +-
 vectorclass/dispatch_example.cpp   |   12 +-
 vectorclass/instrset.h             |   60 +-
 vectorclass/instrset_detect.cpp    |   18 +-
 vectorclass/special.zip            |  Bin 34477 -> 34897 bytes
 vectorclass/vectorclass.h          |   10 +-
 vectorclass/vectorclass.pdf        |  Bin 476370 -> 431608 bytes
 vectorclass/vectorf128.h           |   46 +-
 vectorclass/vectorf256.h           |   14 +-
 vectorclass/vectorf256e.h          |   13 +-
 vectorclass/vectorf512.h           |   22 +-
 vectorclass/vectorf512e.h          |   13 +-
 vectorclass/vectori128.h           |  140 +-
 vectorclass/vectori256.h           |   28 +-
 vectorclass/vectori256e.h          |   21 +-
 vectorclass/vectori512.h           |   31 +-
 vectorclass/vectori512e.h          |   25 +-
 vectorclass/vectormath_common.h    |   93 +-
 vectorclass/vectormath_exp.h       |   27 +-
 vectorclass/vectormath_hyp.h       |    9 +-
 vectorclass/vectormath_lib.h       |  103 +-
 vectorclass/vectormath_trig.h      |  204 +--
 vectorf64.h                        |  377 +++++
 124 files changed, 14411 insertions(+), 5890 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 73688ca..767f309 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 ##################################################################
 # IQ-TREE cmake build definition
-# Copyright (c) 2012-2015 Bui Quang Minh, Lam Tung Nguyen
+# Copyright (c) 2012-2015 Bui Quang Minh, Lam-Tung Nguyen
 ##################################################################
 
 # Windows example usages:
@@ -18,12 +18,17 @@
 # cmake -DIQTREE_FLAGS="m32" <source_dir>      (32-bit sequential version)
 # cmake -DIQTREE_FLAGS="m32 omp" <source_dir>  (32-bit OpenMP version)
 #
-
+# To compile with CLANG on Linux:
+# export CC=/usr/bin/clang
+# export CXX=/usr/bin/clang++
+# Best practices for setting up CMake for different compilers can be found here:
+# http://stackoverflow.com/questions/7031126/switching-between-gcc-and-clang-llvm-using-cmake
+#
 # Mac OSX example usages:
 #------------------------
 # cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ <source_dir>							(sequential version)
 #
-# To build OpenMP version one needs to download Clang version 3.7 or later (as of November 2015) 
+# To build OpenMP version one needs to download Clang version 3.7 or later (as of November 2015)
 # Then assuming clang3.7 and clang++3.7 are the newly built compilers, then:
 # cmake -DCMAKE_C_COMPILER=clang3.7 -DCMAKE_CXX_COMPILER=clang++3.7 -DIQTREE_FLAGS="omp" <source_dir>		(OpenMP version)
 #
@@ -44,23 +49,30 @@ project(iqtree)
 add_definitions(-DIQ_TREE)
 # The version number.
 set (iqtree_VERSION_MAJOR 1)
-set (iqtree_VERSION_MINOR 4)
-set (iqtree_VERSION_PATCH "4") 
+set (iqtree_VERSION_MINOR 5)
+set (iqtree_VERSION_PATCH "2")
 
 set(BUILD_SHARED_LIBS OFF)
 
+if (CMAKE_C_COMPILER MATCHES "mpi")
+    set(IQTREE_FLAGS "${IQTREE_FLAGS} mpi")
+endif()
+
 message("IQ-TREE flags : ${IQTREE_FLAGS}")
 
-if (NOT CMAKE_BUILD_TYPE) 
+if (NOT CMAKE_BUILD_TYPE)
 	set(CMAKE_BUILD_TYPE "Release")
 endif()
 
-if (CMAKE_BUILD_TYPE STREQUAL "Release") 
+if (CMAKE_BUILD_TYPE STREQUAL "Release")
 	message("Builde mode   : Release")
 endif()
 
-include_directories("${PROJECT_SOURCE_DIR}")
+if (CMAKE_GENERATOR MATCHES "Xcode")
+    set(CMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT "dwarf-with-dsym")
+endif()
 
+include_directories("${PROJECT_SOURCE_DIR}")
 
 ##################################################################
 # Detect target platforms
@@ -73,18 +85,18 @@ if (WIN32)
     endif()
     SET(CMAKE_FIND_LIBRARY_SUFFIXES .lib .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
     add_definitions(-DWIN32)
-elseif (APPLE) 
+elseif (APPLE)
 	message("Target OS     : Mac OS X")
 	# to be compatible back to Mac OS X 10.6
-	if (IQTREE_FLAGS MATCHES "oldmac") 
-		add_definitions("-mmacosx-version-min=10.5") 
+	if (IQTREE_FLAGS MATCHES "oldmac")
+		add_definitions("-mmacosx-version-min=10.5")
 		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -mmacosx-version-min=10.5")
 	else()
-		add_definitions("-mmacosx-version-min=10.6") 
+		add_definitions("-mmacosx-version-min=10.6")
 		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -mmacosx-version-min=10.6")
 	endif()
     SET(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
-elseif (UNIX) 
+elseif (UNIX)
 	message("Target OS     : Unix")
 	# build as static binary to run on most machines
     if (NOT IQTREE_FLAGS MATCHES "static")
@@ -105,38 +117,48 @@ set(GCC "FALSE")   #  GNU compiler
 set(CLANG "FALSE") # Clang compiler
 set(ICC "FALSE")   # Intel compiler
 set(VCC "FALSE")   # MS Visual C Compiler, note that it is different from MSVC variable
+# using C++11 standard
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 
-if (CMAKE_COMPILER_IS_GNUCXX) 	
+if (CMAKE_COMPILER_IS_GNUCXX)
 	message("Compiler      : GNU Compiler (gcc)")
 	set(GCC "TRUE")
 #	set(COMBINED_FLAGS "-Wall -Wno-unused-function -Wno-sign-compare -pedantic -D_GNU_SOURCE -fms-extensions -Wno-deprecated")
-#	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++98")
 	set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g")
 	set(CMAKE_C_FLAGS_RELEASE "-O3 -g")
-    # require at least gcc 4.6
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.6)
-        message(FATAL_ERROR "GCC version must be at least 4.6!")
+    # require at least gcc 4.8
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
+        message(FATAL_ERROR "GCC version must be at least 4.8!")
+    endif()
+    if (WIN32)
+        # disable AVX on Windows due to memory alignment
+        set(IQTREE_FLAGS "${IQTREE_FLAGS} novx")
+        message("WARNING: AVX is disabled on Windows as GCC does not properly suport memory alignment")
     endif()
 elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	message("Compiler      : Clang")
 	set(CLANG "TRUE")
 #	set(COMBINED_FLAGS "-Wall -Wno-unused-function -Wno-sign-compare -pedantic -D_GNU_SOURCE -Wno-nested-anon-types")
+    #if (APPLE AND NOT CMAKE_BUILD_TYPE MATCHES "Debug")
+    #    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
+    #    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++")
+    #endif()
 	set(CMAKE_CXX_FLAGS_RELEASE "-O3")
-	set(CMAKE_C_FLAGS_RELEASE "-O3")	
+	set(CMAKE_C_FLAGS_RELEASE "-O3")
 elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
 	set(VCC "TRUE")
 	message("Compiler      : MS Visual C++ Compiler")
 elseif (CMAKE_CXX_COMPILER_ID MATCHES "Intel")
 	message("Compiler      : Intel C++ Compiler (icc)")
 	set(ICC "TRUE")
-	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Qstd=c99")
+    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Qstd=c99")
 else()
 	message("Compiler      : Unknown and untested yet")
 endif()
 
 set(EXE_SUFFIX "")
 
-if (MSVC) 
+if (MSVC)
 	# MS Visual Studio environment
 	message("Exporting MS Visual Studio projects...")
 	add_definitions(/MP) # enable multi-processor compilation
@@ -145,11 +167,29 @@ if (MSVC)
 		if (VCC)
 			add_definitions(/O2)
 		elseif (ICC)
+            #add_definitions(/O3)
 			add_definitions(/O3)
 		endif()
 	endif()
 endif()
 
+##################################################################
+# configure MPI compilation
+##################################################################
+
+if (IQTREE_FLAGS MATCHES "mpi")
+    add_definitions(-D_IQTREE_MPI)
+    if (NOT CMAKE_CXX_COMPILER MATCHES "mpi")
+        # if not using the MPI compiler wrapper, set own options manually 
+        find_package(MPI REQUIRED)
+        set(CMAKE_CXX_COMPILE_FLAGS "${CMAKE_CXX_COMPILE_FLAGS} ${MPI_CXX_COMPILE_FLAGS}")
+        set(CMAKE_C_COMPILE_FLAGS "${CMAKE_C_COMPILE_FLAGS} ${MPI_C_COMPILE_FLAGS}")
+        set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} ${MPI_CXX_LINK_FLAGS}")
+        set(CMAKE_C_LINK_FLAGS "${CMAKE_C_LINK_FLAGS} ${MPI_C_LINK_FLAGS}")
+        include_directories(${MPI_C_INCLUDE_PATH})
+        include_directories(${MPI_CXX_INCLUDE_PATH})
+    endif()
+endif()
 
 
 ##################################################################
@@ -170,7 +210,7 @@ if(CMAKE_SIZEOF_VOID_P EQUAL 4 OR IQTREE_FLAGS MATCHES "m32")
 	if (CMAKE_GENERATOR MATCHES "Win64")
 		error("Both 32-bit and 64-bit mode cannot be specified")
 	endif()
-	SET(EXE_SUFFIX "${EXE_SUFFIX}32")
+	#SET(EXE_SUFFIX "${EXE_SUFFIX}32")
 	if (GCC OR CLANG) 
 		set(COMBINED_FLAGS "${COMBINED_FLAGS} -m32")
   	endif()
@@ -179,7 +219,7 @@ else()
 	message("Target binary : 64-bit")
 endif()
 
-if(IQTREE_FLAGS MATCHES "novx") 
+if(IQTREE_FLAGS MATCHES "novx")
     add_definitions(-D__NOAVX__)
 endif()
 
@@ -188,44 +228,52 @@ endif()
 # change the executable name if compiled for OpenMP parallel version
 ##################################################################
 if (IQTREE_FLAGS MATCHES "omp")
-	message("Parallel      : OpenMP/PThreads")
-	SET(EXE_SUFFIX "${EXE_SUFFIX}-omp")	
+	message("OpenMP        : Yes")
+	SET(EXE_SUFFIX "${EXE_SUFFIX}-omp")
 	add_definitions(-D_USE_PTHREADS)
-	if (MSVC) 
+	if (MSVC)
 		add_definitions(/MT)
 	endif()
-	
-	if (VCC) 
+
+	if (VCC)
   		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
-  		include_directories("${PROJECT_SOURCE_DIR}/pll") # for PThreads headers 
+  		include_directories("${PROJECT_SOURCE_DIR}/pll") # for PThreads headers
 	elseif (ICC)
   		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qopenmp")
   		if (WIN32)
   			include_directories("${PROJECT_SOURCE_DIR}/pll") # for PThreads headers
-  		endif() 
+  		endif()
   	elseif (GCC)
 		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
-  		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
-  	elseif (CLANG) 
+  		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread")
+  	elseif (CLANG)
 		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
   		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libomp")
   	endif()
 else()
-	message("Parallel      : None")
+	message("OpenMP        : NONE")
+endif()
+
+
+if (IQTREE_FLAGS MATCHES "mpi")
+	message("MPI           : Yes")
+	SET(EXE_SUFFIX "${EXE_SUFFIX}-mpi")
+else()
+	message("MPI           : NONE")
 endif()
 
 ##################################################################
 # configure SSE/AVX/FMA instructions
 ##################################################################
 
-SET(AVX_FLAGS "-D__AVX")
+SET(AVX_FLAGS "-D__SSE3 -D__AVX")
 if (VCC) 
 	set(AVX_FLAGS "${AVX_FLAGS} /arch:AVX")
 elseif (CLANG)
 	set(AVX_FLAGS "${AVX_FLAGS} -mavx")
 elseif (GCC)
 	set(AVX_FLAGS "${AVX_FLAGS} -mavx -fabi-version=0")
-elseif (ICC) 
+elseif (ICC)
 	if (WIN32)
 		 set(AVX_FLAGS "${AVX_FLAGS} /arch:AVX")
 	else()
@@ -233,47 +281,67 @@ elseif (ICC)
 	endif()
 endif()
 
-SET(SSE_FLAGS "")
+SET(SSE_FLAGS "-D__SSE3")
 if (VCC)
-	set(SSE_FLAGS "/arch:SSE2 -D__SSE3__")
+	set(SSE_FLAGS "${SSE_FLAGS} /arch:SSE2 -D__SSE3__")
 elseif (GCC OR CLANG)
-	set(SSE_FLAGS "-msse3")
+	set(SSE_FLAGS "${SSE_FLAGS} -msse3")
 elseif (ICC)
 	if (WIN32)
-		set(SSE_FLAGS "/arch:SSE3")
+		set(SSE_FLAGS "${SSE_FLAGS} /arch:SSE3")
 	else()
-		set(SSE_FLAGS "-msse3")
+		set(SSE_FLAGS "${SSE_FLAGS} -msse3")
 	endif()
 endif()
 
-if (IQTREE_FLAGS MATCHES "fma") # AVX+FMA instruction set
- 	message("Vectorization : AVX+FMA")
-	add_definitions(-D__SSE3 -D__AVX) # define both SSE3 and AVX directive
-	if (VCC)
-		# Visual C++ has no /mfma flag!, FMA is only included in AVX2 
-		set(COMBINED_FLAGS "${COMBINED_FLAGS} /arch:AVX2")
-	elseif (CLANG)
-		set(COMBINED_FLAGS "${COMBINED_FLAGS} -mavx -mfma")
-	elseif (GCC)
-		set(COMBINED_FLAGS "${COMBINED_FLAGS} -mavx -fabi-version=0 -mfma")
-	elseif (ICC) 
-		if (WIN32)
-			 set(COMBINED_FLAGS "${COMBINED_FLAGS} /arch:AVX /Qfma")
-		else()
-			 set(COMBINED_FLAGS "${COMBINED_FLAGS} -mavx -mfma")
-		endif()
+SET(FMA_FLAGS "-D__SSE3 -D__AVX")
+if (VCC) 
+	set(FMA_FLAGS "${FMA_FLAGS} /arch:AVX2")
+elseif (CLANG)
+	set(FMA_FLAGS "${FMA_FLAGS} -mavx -mfma")
+elseif (GCC)
+	set(FMA_FLAGS "${FMA_FLAGS} -mavx -fabi-version=0 -mfma")
+elseif (ICC) 
+	if (WIN32)
+		 set(FMA_FLAGS "${FMA_FLAGS} /arch:AVX /Qfma")
+	else()
+		 set(FMA_FLAGS "${FMA_FLAGS} -march=core-avx2")
 	endif()
+endif()
 
-	SET(EXE_SUFFIX "${EXE_SUFFIX}-fma")
+SET(AVX512_FLAGS "-D__SSE3 -D__AVX")
+if (VCC)
+    message("AVX512 not available in Visual C++")
+	#set(AVX512_FLAGS "${AVX512_FLAGS} /arch:AVX512")
+elseif (CLANG)
+	set(AVX512_FLAGS "${AVX512_FLAGS} -mavx512f -mfma")
+elseif (GCC)
+	set(AVX512_FLAGS "${AVX512_FLAGS} -mavx512f -mfma")
+elseif (ICC) 
+	if (WIN32)
+		 set(AVX512_FLAGS "${AVX512_FLAGS} /arch:MIC-AVX512 /Qfma")
+	else()
+		 set(AVX512_FLAGS "${AVX512_FLAGS} -xMIC-AVX512 -mfma")
+	endif()
+endif()
 
+
+# further flag to improve performance
+
+if (IQTREE_FLAGS MATCHES "fma") # AVX+FMA instruction set
+ 	message("Vectorization : AVX+FMA")
+	add_definitions(-D__SSE3 -D__AVX) # define both SSE3 and AVX directive
+    set(COMBINED_FLAGS "${COMBINED_FLAGS} ${FMA_FLAGS}")
+	#SET(EXE_SUFFIX "${EXE_SUFFIX}-fma")
 elseif (IQTREE_FLAGS MATCHES "avx") # AVX instruction set
  	message("Vectorization : AVX")
 	add_definitions(-D__SSE3 -D__AVX) # define both SSE3 and AVX directive
 	set(COMBINED_FLAGS "${COMBINED_FLAGS} ${AVX_FLAGS}")
-	SET(EXE_SUFFIX "${EXE_SUFFIX}-avx")
-else() #SSE intruction set
-	message("Vectorization : SSE3")
-	add_definitions(-D__SSE3)
+	#SET(EXE_SUFFIX "${EXE_SUFFIX}-avx")
+elseif (NOT IQTREE_FLAGS MATCHES "nosse") #SSE intruction set
+	message("Vectorization : SSE3/AVX/AVX2")
+	#add_definitions(-D__SSE3)
+    #set(COMBINED_FLAGS "${COMBINED_FLAGS} ${SSE_FLAGS}")
 endif()
 
 
@@ -283,21 +351,26 @@ endif()
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMBINED_FLAGS}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMBINED_FLAGS}")
+set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS} -fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls -fno-default-inline -fno-inline -O2 -fno-omit-frame-pointer -g")
+set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS} -fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls -O2 -fno-omit-frame-pointer -g")
 
 if (CMAKE_BUILD_TYPE STREQUAL "Release")
-	message("C flags    : ${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_RELEASE}") 
-	message("CXX flags  : ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}") 
+	message("C flags       : ${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_RELEASE}")
+	message("CXX flags     : ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}")
 endif()
 
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
-	message("C flags    : ${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_DEBUG}") 
-	message("CXX flags  : ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}") 
+	message("C flags       : ${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_DEBUG}")
+	message("CXX flags     : ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}")
+endif()
+
+if (CMAKE_BUILD_TYPE STREQUAL "Profile")
+	message("C flags       : ${CMAKE_C_FLAGS_PROFILE} ")
+	message("CXX flags     : ${CMAKE_CXX_FLAGS_PROFILE} ")
 endif()
 
-set(CMAKE_CXX_FLAGS_PROFILE "-fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls -fno-default-inline -fno-inline -O0 -fno-omit-frame-pointer -pg")
-set(CMAKE_C_FLAGS_PROFILE "-fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls -O0 -fno-omit-frame-pointer -pg")
 
-if (GCC) 
+if (GCC)
 	set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-inline-functions -fno-inline-functions-called-once -fno-default-inline -fno-inline")
 	set(CMAKE_C_FLAGS_DEBUG "-O0 -g -fno-inline-functions -fno-inline-functions-called-once -fno-default-inline -fno-inline")
 	set(CMAKE_CXX_FLAGS_MEM "-g -O1")
@@ -317,6 +390,7 @@ check_function_exists (gettimeofday HAVE_GETTIMEOFDAY)
 check_function_exists (getrusage HAVE_GETRUSAGE)
 check_function_exists (GlobalMemoryStatusEx HAVE_GLOBALMEMORYSTATUSEX)
 check_function_exists (strndup HAVE_STRNDUP)
+find_package(Backtrace)
 
 # configure a header file to pass some of the CMake settings
 # to the source code
@@ -334,7 +408,7 @@ include_directories("${PROJECT_BINARY_DIR}")
 
 
 ##################################################################
-# subdirectories containing necessary libraries for the build 
+# subdirectories containing necessary libraries for the build
 ##################################################################
 add_subdirectory(pll)
 add_subdirectory(ncl)
@@ -355,73 +429,88 @@ add_subdirectory(gsl)
 # the main executable
 ##################################################################
 
+add_library(kernelsse phylokernelsse.cpp)
+
 if (NOT BINARY32 AND NOT IQTREE_FLAGS MATCHES "novx")
-add_library(avxkernel phylotreeavx.cpp)
+add_library(kernelavx phylotreeavx.cpp)
+add_library(kernelfma phylokernelfma.cpp)
+if (IQTREE_FLAGS MATCHES "512")
+    add_library(kernelavx512 phylokernelavx512.cpp)
+    add_definitions(-DINCLUDE_AVX512)
+endif()
+endif()
+
+if (IQTREE_FLAGS MATCHES "mpi")
+	add_library(mympi TreeCollection.cpp ObjectStream.cpp)
 endif()
 
 add_executable(iqtree
-alignment.cpp
-alignmentpairwise.cpp
-circularnetwork.cpp
-eigendecomposition.cpp
-greedy.cpp
-gss.cpp
-#guidedbootstrap.cpp
-gurobiwrapper.cpp
-gzstream.cpp
-hashsplitset.cpp
-iqtree.cpp
-maalignment.cpp
-matree.cpp
-mexttree.cpp
-mpdablock.cpp
-msetsblock.cpp
-msplitsblock.cpp
-modelsblock.cpp
-mtree.cpp
-mtreeset.cpp
-ncbitree.cpp
-ngs.cpp
-node.cpp
-optimization.cpp
-parsmultistate.cpp
-pattern.cpp
+alignment.cpp alignment.h
+alignmentpairwise.cpp alignmentpairwise.h
+circularnetwork.cpp circularnetwork.h
+eigendecomposition.cpp eigendecomposition.h
+greedy.cpp greedy.h
+gss.cpp gss.h
+gurobiwrapper.cpp gurobiwrapper.h
+gzstream.cpp gzstream.h
+hashsplitset.cpp hashsplitset.h
+iqtree.cpp iqtree.h
+maalignment.cpp maalignment.h
+matree.cpp matree.h
+mexttree.cpp mexttree.h
+mpdablock.cpp mpdablock.h
+msetsblock.cpp msetsblock.h
+msplitsblock.cpp msplitsblock.h
+modelsblock.cpp modelsblock.h
+mtree.cpp mtree.h
+mtreeset.cpp mtreeset.h
+ncbitree.cpp ncbitree.h
+ngs.cpp ngs.h
+node.cpp node.h
+optimization.cpp optimization.h
+parsmultistate.cpp parsmultistate.h
+pattern.cpp pattern.h
 pda.cpp
-pdnetwork.cpp
-pdtree.cpp
-pdtreeset.cpp
-phyloanalysis.cpp
-phylonode.cpp
-phylosupertree.cpp
-phylotree.cpp
-phylotreesse.cpp
+pdnetwork.cpp pdnetwork.h
+pdtree.cpp pdtree.h
+pdtreeset.cpp pdtreeset.h
+phyloanalysis.cpp phyloanalysis.h
+phylonode.cpp phylonode.h
+phylosupertree.cpp phylosupertree.h
+phylotree.cpp phylotree.h
+phylotreesse.cpp phylokernelnew.h
 phylotreepars.cpp
-phylokernelsitemodel.cpp
-#phylotreeavx.cpp
-pruning.cpp
+pruning.cpp pruning.h
 quartet.cpp
-split.cpp
-splitgraph.cpp
-splitset.cpp
-stoprule.cpp
-superalignment.cpp
-superalignmentpairwise.cpp
-supernode.cpp
-tinatree.cpp
-tools.cpp
-whtest_wrapper.cpp
-lpwrapper.c
-pllnni.cpp
-phylosupertreeplen.cpp
-phylotesting.cpp
-ecopd.cpp
-ecopdmtreeset.cpp
-graph.cpp
-candidateset.cpp
-checkpoint.cpp
-upperbounds.cpp
+split.cpp split.h
+splitgraph.cpp splitgraph.h
+splitset.cpp splitset.h
+stoprule.cpp stoprule.h
+superalignment.cpp superalignment.h
+superalignmentpairwise.cpp superalignmentpairwise.h
+supernode.cpp supernode.h
+tinatree.cpp tinatree.h
+tools.cpp tools.h
+whtest_wrapper.cpp whtest_wrapper.h
+lpwrapper.c lpwrapper.h
+pllnni.cpp pllnni.h
+phylosupertreeplen.cpp phylosupertreeplen.h
+phylotesting.cpp phylotesting.h
+ecopd.cpp ecopd.h
+ecopdmtreeset.cpp ecopdmtreeset.h
+graph.cpp graph.h
+candidateset.cpp candidateset.h
+checkpoint.cpp checkpoint.h
+constrainttree.cpp constrainttree.h
+MPIHelper.cpp MPIHelper.h
+memslot.cpp memslot.h
 )
 
+if(Backtrace_FOUND)
+  include_directories(${Backtrace_INCLUDE_DIR})
+  target_link_libraries(iqtree ${Backtrace_LIBRARY})
+endif(Backtrace_FOUND)
+
 if (NOT IQTREE_FLAGS MATCHES "nozlib")
     find_package(ZLIB)
 endif()
@@ -438,18 +527,25 @@ else(ZLIB_FOUND)
 endif(ZLIB_FOUND)
 
 if (NOT IQTREE_FLAGS MATCHES "avx" AND NOT IQTREE_FLAGS MATCHES "fma")
-	set_target_properties(iqtree pll ncl lbfgsb whtest sprng vectorclass model PROPERTIES COMPILE_FLAGS "${SSE_FLAGS}")
+    if (NOT IQTREE_FLAGS MATCHES "nosse")
+        set_target_properties(iqtree ncl lbfgsb whtest sprng vectorclass model PROPERTIES COMPILE_FLAGS "${SSE_FLAGS}")
+    endif()
+    set_target_properties(kernelsse pll PROPERTIES COMPILE_FLAGS "${SSE_FLAGS}")
 	if (NOT BINARY32 AND NOT IQTREE_FLAGS MATCHES "novx")
-		set_target_properties(avxkernel pllavx PROPERTIES COMPILE_FLAGS "${AVX_FLAGS}")
+		set_target_properties(kernelavx pllavx PROPERTIES COMPILE_FLAGS "${AVX_FLAGS}")
+		set_target_properties(kernelfma PROPERTIES COMPILE_FLAGS "${FMA_FLAGS}")
+        if (IQTREE_FLAGS MATCHES "512")
+            set_target_properties(kernelavx512 PROPERTIES COMPILE_FLAGS "${AVX512_FLAGS}")
+        endif()
 	endif()
-endif()  
+endif()
 
 ##################################################################
 # setup linking flags
 ##################################################################
 
 # link special lib for WIN32
-if (WIN32) 
+if (WIN32)
 	set(PLATFORM_LIB "ws2_32")
 else()
 	set(PLATFORM_LIB "m")
@@ -460,10 +556,10 @@ if(CLANG AND WIN32 AND IQTREE_FLAGS MATCHES "static")
 endif()
 
 set(THREAD_LIB "")
-if (IQTREE_FLAGS MATCHES "omp") 
+if (IQTREE_FLAGS MATCHES "omp")
 	link_directories(${PROJECT_SOURCE_DIR}/lib)
 	if (MSVC)
-		if (BINARY32) 
+		if (BINARY32)
             set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /LIBPATH:${PROJECT_SOURCE_DIR}/lib32")
 			set(THREAD_LIB "pthreadVC2")
 		else()
@@ -471,9 +567,9 @@ if (IQTREE_FLAGS MATCHES "omp")
 			set(THREAD_LIB "pthreadVC2")
 		endif()
 	elseif(CLANG AND APPLE)
-		set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${PROJECT_SOURCE_DIR}/libmac")
+		set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${PROJECT_SOURCE_DIR}/libmac -fopenmp=libomp")
 	elseif(CLANG AND WIN32)
-        if (BINARY32) 
+        if (BINARY32)
             set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${PROJECT_SOURCE_DIR}/lib32 libiomp5md.dll")
         else()
             set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${PROJECT_SOURCE_DIR}/lib libiomp5md.dll")
@@ -482,14 +578,30 @@ if (IQTREE_FLAGS MATCHES "omp")
 	endif()
 endif()
 
-if (BINARY32 OR IQTREE_FLAGS MATCHES "novx")
-    target_link_libraries(iqtree pll ncl lbfgsb whtest sprng vectorclass model gsl ${PLATFORM_LIB} ${STD_LIB} ${THREAD_LIB})	
-else()
-    target_link_libraries(iqtree pll pllavx ncl lbfgsb whtest sprng vectorclass model avxkernel gsl ${PLATFORM_LIB} ${STD_LIB} ${THREAD_LIB})	
+# basic linking libraries
+target_link_libraries(iqtree pll ncl lbfgsb whtest sprng vectorclass model gsl ${PLATFORM_LIB} ${STD_LIB} ${THREAD_LIB})
+
+if (NOT IQTREE_FLAGS MATCHES "nosse")
+    target_link_libraries(iqtree kernelsse)
 endif()
 
-##################################################################
-# setup the executable name 
+# MPI libraries
+if (IQTREE_FLAGS MATCHES "mpi")
+    target_link_libraries(iqtree mympi)
+    if (NOT CMAKE_CXX_COMPILER MATCHES "mpi")
+        target_link_libraries(iqtree ${MPI_CXX_LIBRARIES})
+    endif()
+endif()
+
+# SSE, AVX etc. libraries
+if (NOT BINARY32 AND NOT IQTREE_FLAGS MATCHES "novx")
+    target_link_libraries(iqtree pllavx kernelavx kernelfma)
+    if (IQTREE_FLAGS MATCHES "512")
+        target_link_libraries(iqtree kernelavx512)
+    endif()
+endif()
+
+# setup the executable name
 ##################################################################
 set_target_properties(iqtree PROPERTIES OUTPUT_NAME "iqtree${EXE_SUFFIX}")
 
@@ -497,7 +609,7 @@ set_target_properties(iqtree PROPERTIES OUTPUT_NAME "iqtree${EXE_SUFFIX}")
 if (CMAKE_BUILD_TYPE STREQUAL "Release" AND (GCC OR CLANG)) # strip is not necessary for MSVC
 	if (WIN32)
 		ADD_CUSTOM_COMMAND(TARGET iqtree POST_BUILD COMMAND strip $<TARGET_FILE:iqtree>)
-	else()
+	elseif (NOT APPLE)
 		ADD_CUSTOM_COMMAND(TARGET iqtree POST_BUILD COMMAND ${CMAKE_STRIP} $<TARGET_FILE:iqtree>)
 	endif()
 endif()
@@ -509,11 +621,11 @@ else()
 endif()
 
 if (WIN32)
-	if (MSVC) 
+	if (MSVC)
 		ADD_CUSTOM_COMMAND(TARGET iqtree POST_BUILD COMMAND copy "Release\\iqtree${EXE_SUFFIX}.exe" "Release\\iqtree${EXE_SUFFIX}-click.exe")
 	else()
 		ADD_CUSTOM_COMMAND(TARGET iqtree POST_BUILD COMMAND copy "iqtree${EXE_SUFFIX}.exe" "iqtree${EXE_SUFFIX}-click.exe")
-	endif()	
+	endif()
 endif()
 
 ##############################################################
@@ -552,7 +664,7 @@ endif()
 # build a CPack driven installer package
 ##############################################################
 include (InstallRequiredSystemLibraries)
-set (CPACK_RESOURCE_FILE_LICENSE  
+set (CPACK_RESOURCE_FILE_LICENSE
      "${CMAKE_CURRENT_SOURCE_DIR}/License.txt")
 set (CPACK_PACKAGE_VERSION_MAJOR "${iqtree_VERSION_MAJOR}")
 set (CPACK_PACKAGE_VERSION_MINOR "${iqtree_VERSION_MINOR}")
@@ -572,25 +684,19 @@ set(CPACK_SOURCE_IGNORE_FILES
 
 set (SYSTEM_NAME "${CMAKE_SYSTEM_NAME}")
 if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin")
-	if (IQTREE_FLAGS MATCHES "oldmac") 
+	if (IQTREE_FLAGS MATCHES "oldmac")
 		set (SYSTEM_NAME "MacOS10.5")
-	else() 
+	else()
 		set (SYSTEM_NAME "MacOSX")
 	endif()
 endif()
 
 if (BINARY32) 
-    set (PROJECT_NAME_SUFFIX "${EXE_SUFFIX}")
-else()
-    set (PROJECT_NAME_SUFFIX "${EXE_SUFFIX}")
+    set (SYSTEM_NAME "${SYSTEM_NAME}32")
 endif()
 
-#if (NOT IQTREE_FLAGS MATCHES "omp" AND NOT IQTREE_FLAGS MATCHES "avx" AND NOT IQTREE_FLAGS MATCHES "fma")  
-#	set (PROJECT_NAME_SUFFIX "${PROJECT_NAME_SUFFIX}-sse") 
-#endif()
-
 set(CPACK_PACKAGE_FILE_NAME 
-	"${CMAKE_PROJECT_NAME}${PROJECT_NAME_SUFFIX}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}-${SYSTEM_NAME}")
+	"${CMAKE_PROJECT_NAME}${EXE_SUFFIX}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}-${SYSTEM_NAME}")
 
 set(CPACK_STRIP_FILES TRUE)
 
diff --git a/MPIHelper.cpp b/MPIHelper.cpp
new file mode 100644
index 0000000..a950e8b
--- /dev/null
+++ b/MPIHelper.cpp
@@ -0,0 +1,560 @@
+//
+// Created by tung on 6/18/15.
+//
+
+#include "MPIHelper.h"
+#include "timeutil.h"
+
+/**
+ *  Return the process-wide MPIHelper singleton.
+ *
+ *  The object is a function-local static, created on the first call.
+ *  In a build without _IQTREE_MPI every call (re)sets it to describe
+ *  a run with a single process whose ID is 0.
+ */
+
+MPIHelper& MPIHelper::getInstance() {
+    static MPIHelper singleton;
+#ifndef _IQTREE_MPI
+    // sequential build: this is the only process
+    singleton.setNumProcesses(1);
+    singleton.setProcessID(0);
+#endif
+    return singleton;
+}
+
+/**
+ *  Post a non-blocking send of the given trees to every other process.
+ *
+ *  The message is a serialized TreeCollection whose source-process list is
+ *  filled with the ID of the current process. Each request and its buffer are
+ *  stored in sentMessages until cleanUpMessages() finds the send completed.
+ *  Does nothing when only one process is running or in a non-MPI build.
+ *
+ *  @param treeStrings trees to send
+ *  @param scores scores of the trees, in the same order as treeStrings
+ *  @param tag MPI tag attached to every message
+ */
+void MPIHelper::distributeTrees(vector<string> &treeStrings, vector<double> &scores, int tag) {
+    if (getNumProcesses() == 1)
+        return;
+#ifdef _IQTREE_MPI
+    // every tree in this message originates from the current process
+    vector<int> origin(scores.size(), getProcessID());
+    TreeCollection payload(treeStrings, scores, origin);
+    // release buffers of earlier sends that have completed
+    cleanUpMessages();
+    for (int rank = 0; rank < getNumProcesses(); rank++) {
+        if (rank == getProcessID())
+            continue;
+        MPI_Request *pending = new MPI_Request;
+        ObjectStream *buffer = new ObjectStream(payload);
+        MPI_Isend(buffer->getObjectData(), buffer->getDataLength(), MPI_CHAR, rank, tag, MPI_COMM_WORLD, pending);
+        sentMessages.push_back(make_pair(pending, buffer));
+        // test the fresh request once; the outcome is not used here
+        int done = 0;
+        MPI_Status ignored;
+        MPI_Test(pending, &done, &ignored);
+    }
+    //numTreeSent += treeStrings.size();
+#endif
+}
+
+/**
+ *  Send a single tree to every other process via distributeTrees().
+ *
+ *  Increments numTreeSent by one and, at verbosity VB_MED or higher, prints
+ *  the elapsed wall-clock time. Does nothing when only one process is running
+ *  or in a non-MPI build.
+ *
+ *  @param treeString the tree to send
+ *  @param score score of the tree
+ *  @param tag MPI tag attached to the messages
+ */
+void MPIHelper::distributeTree(string treeString, double score, int tag) {
+    if (getNumProcesses() == 1)
+        return;
+#ifdef _IQTREE_MPI
+    const double startTime = getRealTime();
+    // wrap the single tree and score into one-element vectors
+    vector<string> oneTree(1, treeString);
+    vector<double> oneScore(1, score);
+    distributeTrees(oneTree, oneScore, tag);
+    if (verbose_mode >= VB_MED) {
+        cout << "Sent tree to other processes in " << getRealTime() - startTime << " seconds" << endl;
+    }
+    ++numTreeSent;
+#endif
+}
+
+/**
+ *  Post a non-blocking send of the given trees to one process.
+ *
+ *  The request and its buffer are stored in sentMessages until
+ *  cleanUpMessages() finds the send completed; numTreeSent grows by the
+ *  number of trees. Does nothing when only one process is running, when
+ *  dest is the current process, or in a non-MPI build.
+ *
+ *  @param dest ID of the destination process
+ *  @param treeStrings trees to send
+ *  @param scores scores of the trees, in the same order as treeStrings
+ *  @param tag MPI tag attached to the message
+ */
+void MPIHelper::sendTrees(int dest, vector<string> &treeStrings, vector<double> &scores, int tag) {
+    if (getNumProcesses() == 1 || dest == getProcessID())
+        return;
+#ifdef _IQTREE_MPI
+    // every tree in this message originates from the current process
+    vector<int> origin(scores.size(), getProcessID());
+    TreeCollection payload(treeStrings, scores, origin);
+    // release buffers of earlier sends that have completed
+    cleanUpMessages();
+    MPI_Request *pending = new MPI_Request;
+    ObjectStream *buffer = new ObjectStream(payload);
+    MPI_Isend(buffer->getObjectData(), buffer->getDataLength(), MPI_CHAR, dest, tag, MPI_COMM_WORLD, pending);
+    sentMessages.push_back(make_pair(pending, buffer));
+    numTreeSent += treeStrings.size();
+
+    // test the fresh request once; the outcome is not used here
+    int done = 0;
+    MPI_Status ignored;
+    MPI_Test(pending, &done, &ignored);
+#endif
+}
+
+/**
+ *  Send a single tree to one process via sendTrees().
+ *
+ *  Does nothing when only one process is running, when dest is the current
+ *  process, or in a non-MPI build.
+ *
+ *  @param dest ID of the destination process
+ *  @param treeString the tree to send
+ *  @param score score of the tree
+ *  @param tag MPI tag attached to the message
+ */
+void MPIHelper::sendTree(int dest, string treeString, double score, int tag) {
+    if (getNumProcesses() == 1 || dest == getProcessID())
+        return;
+#ifdef _IQTREE_MPI
+    // wrap the single tree and score into one-element vectors
+    DoubleVector oneScore;
+    StrVector oneTree;
+    oneScore.push_back(score);
+    oneTree.push_back(treeString);
+    sendTrees(dest, oneTree, oneScore, tag);
+#endif
+}
+
+/**
+ *  Blocking exchange with one process: send the given trees, then wait for
+ *  and receive that process's reply.
+ *
+ *  On return treeStrings and scores hold the trees of the reply; they are left
+ *  empty when the reply carries STOP_TAG. numTreeSent and numTreeReceived are
+ *  updated, and the elapsed wall-clock time is printed.
+ *
+ *  @param dest ID of the process to exchange with
+ *  @param[in,out] treeStrings trees to send; replaced by the trees received
+ *  @param[in,out] scores scores to send; replaced by the scores received
+ *  @param tag MPI tag attached to the outgoing message
+ *  @return tag of the reply message; the input tag when nothing is exchanged
+ *          (single process, dest is the current process, or non-MPI build)
+ */
+int MPIHelper::sendRecvTrees(int dest, vector<string> &treeStrings, vector<double> &scores, int tag) {
+    if (getNumProcesses() == 1 || dest == getProcessID())
+        return tag;
+#ifdef _IQTREE_MPI
+    double beginTime = getRealTime();
+    // prepare message
+    vector<int> sourceProcID;
+    sourceProcID.insert(sourceProcID.end(), scores.size(), getProcessID());
+    TreeCollection outTrees(treeStrings, scores, sourceProcID);
+
+    {
+        // blocking send; the stream is an automatic object so its buffer is
+        // released as soon as MPI_Send returns
+        ObjectStream outStream(outTrees);
+        MPI_Send(outStream.getObjectData(), outStream.getDataLength(), MPI_CHAR, dest, tag, MPI_COMM_WORLD);
+    }
+    numTreeSent += treeStrings.size();
+
+    // blocking probe
+    MPI_Status status;
+    MPI_Probe(dest, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+    int msgCount;
+    MPI_Get_count(&status, MPI_CHAR, &msgCount);
+
+    // receive the message
+    char *recvBuffer = new char[msgCount];
+    MPI_Recv(recvBuffer, msgCount, MPI_CHAR, status.MPI_SOURCE, status.MPI_TAG, MPI_COMM_WORLD, &status);
+    treeStrings.clear();
+    scores.clear();
+
+    if (status.MPI_TAG != STOP_TAG) {
+        // automatic object: previously this stream was heap-allocated and
+        // never deleted, leaking one buffer per call
+        ObjectStream inStream(recvBuffer, msgCount);
+        TreeCollection curTrees = inStream.getTreeCollection();
+        treeStrings = curTrees.getTreeStrings();
+        scores = curTrees.getScores();
+        numTreeReceived += treeStrings.size();
+    }
+    delete [] recvBuffer;
+
+    double endTime = getRealTime();
+    cout << "INFO: " << endTime - beginTime << " seconds for " << __func__ << endl;
+
+    return status.MPI_TAG;
+#else
+    return tag;
+#endif
+}
+
+/**
+ *  Blocking exchange initiated by another process: wait for a message from
+ *  any process, reply to its sender with the given trees, then hand back the
+ *  trees that were received.
+ *
+ *  When should_send[sender] is false, treeStrings and scores are resized to a
+ *  single element before replying (an empty input thus becomes "notree" with
+ *  score -DBL_MAX). After the exchange should_send[sender] is set to false.
+ *  numTreeSent and numTreeReceived are updated; a warning is printed when the
+ *  call took more than one second.
+ *
+ *  @param[in,out] treeStrings trees to reply with; replaced by the trees received
+ *  @param[in,out] scores scores to reply with; replaced by the scores received
+ *  @param[in,out] should_send per-process flags, indexed by process ID
+ *  @param tag MPI tag attached to the reply
+ *  @return ID of the process the exchange was done with; 0 when only one
+ *          process is running or in a non-MPI build
+ */
+int MPIHelper::recvSendTrees(vector<string> &treeStrings, vector<double> &scores, vector<bool> &should_send, int tag) {
+    if (getNumProcesses() == 1)
+        return 0;
+#ifdef _IQTREE_MPI
+    double beginTime = getRealTime();
+    // blocking probe
+    MPI_Status status;
+    MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+    int msgCount;
+    MPI_Get_count(&status, MPI_CHAR, &msgCount);
+    int dest = status.MPI_SOURCE;
+
+    // receive the message
+    char *recvBuffer = new char[msgCount];
+    MPI_Recv(recvBuffer, msgCount, MPI_CHAR, status.MPI_SOURCE, status.MPI_TAG, MPI_COMM_WORLD, &status);
+
+    // now send message
+    if (!should_send[dest]) {
+        treeStrings.resize(1, "notree");
+        scores.resize(1, -DBL_MAX);
+    }
+    IntVector sourceProcID;
+    sourceProcID.insert(sourceProcID.end(), scores.size(), getProcessID());
+    TreeCollection outTrees(treeStrings, scores, sourceProcID);
+
+    {
+        // blocking send; the stream is an automatic object so its buffer is
+        // released as soon as MPI_Send returns
+        ObjectStream outStream(outTrees);
+        MPI_Send(outStream.getObjectData(), outStream.getDataLength(), MPI_CHAR, dest, tag, MPI_COMM_WORLD);
+    }
+    numTreeSent += treeStrings.size();
+
+    // now extract trees from received buffer
+    treeStrings.clear();
+    scores.clear();
+    {
+        // automatic object: previously this stream was heap-allocated and
+        // never deleted, leaking one buffer per call
+        ObjectStream inStream(recvBuffer, msgCount);
+        TreeCollection curTrees = inStream.getTreeCollection();
+        treeStrings = curTrees.getTreeStrings();
+        scores = curTrees.getScores();
+    }
+    delete [] recvBuffer;
+    numTreeReceived += treeStrings.size();
+
+    should_send[dest] = false;
+
+    double endTime = getRealTime();
+    if (endTime - beginTime > 1)
+        cout << "WARNING: " << endTime - beginTime << " seconds for " << __func__ << endl;
+
+    return dest;
+#else
+    return 0;
+#endif
+}
+
+/**
+ *  Collect trees from all workers on the master (blocking).
+ *
+ *  Master: clears trees, then receives one message per worker (from any
+ *  source, with any tag) and appends its trees.
+ *  Worker: sends trees to the master with TREE_TAG.
+ *  Both print how many trees were transferred and the elapsed time.
+ *  Does nothing when only one process is running or in a non-MPI build.
+ *
+ *  @param[in,out] trees on workers the trees to send; on the master the
+ *         collection that receives the gathered trees
+ */
+void MPIHelper::gatherTrees(TreeCollection &trees) {
+    if (getNumProcesses() == 1)
+        return;
+#ifdef _IQTREE_MPI
+    double beginTime = getRealTime();
+
+    if (isMaster()) {
+        trees.clear();
+        // Master: receive from all Workers
+        for (int w = 1; w < getNumProcesses(); w++) {
+            // blocking probe
+            MPI_Status status;
+            MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+            int msgCount;
+            MPI_Get_count(&status, MPI_CHAR, &msgCount);
+            // receive the message
+            char *recvBuffer = new char[msgCount];
+            MPI_Recv(recvBuffer, msgCount, MPI_CHAR, status.MPI_SOURCE, status.MPI_TAG, MPI_COMM_WORLD, &status);
+            // automatic object: previously this stream was heap-allocated and
+            // never deleted, leaking one buffer per worker
+            ObjectStream inStream(recvBuffer, msgCount);
+            delete [] recvBuffer; // the stream keeps its own copy of the bytes
+            TreeCollection curTrees = inStream.getTreeCollection();
+            trees.addTrees(curTrees);
+            numTreeReceived += curTrees.getNumTrees();
+        }
+        cout << trees.getNumTrees() << " trees gathered from workers in ";
+    } else {
+        // Worker: send trees to Master with a blocking send
+        ObjectStream outStream(trees);
+        MPI_Send(outStream.getObjectData(), outStream.getDataLength(), MPI_CHAR, PROC_MASTER, TREE_TAG, MPI_COMM_WORLD);
+        numTreeSent += trees.getNumTrees();
+        cout << trees.getNumTrees() << " trees sent to master in ";
+    }
+
+    double endTime = getRealTime();
+    cout << endTime - beginTime << " seconds" << endl;
+#endif
+}
+
+/**
+ *  Broadcast a tree collection from the master to all workers.
+ *
+ *  Two collective MPI_Bcast calls are made: first the byte count of the
+ *  serialized collection, then the bytes themselves. Workers overwrite trees
+ *  with the decoded collection. Every process prints the number of trees and
+ *  the elapsed time. Does nothing when only one process is running or in a
+ *  non-MPI build.
+ *
+ *  @param[in,out] trees on the master the trees to broadcast; on workers
+ *         replaced by the trees received
+ */
+void MPIHelper::broadcastTrees(TreeCollection &trees) {
+    if (getNumProcesses() == 1)
+        return;
+#ifdef _IQTREE_MPI
+    const double startTime = getRealTime();
+
+    // the master serializes its trees; workers learn the size from the first broadcast
+    ObjectStream *stream;
+    int numBytes = 0;
+    if (isMaster()) {
+        stream = new ObjectStream(trees);
+        numBytes = stream->getDataLength();
+    }
+    MPI_Bcast(&numBytes, 1, MPI_INT, PROC_MASTER, MPI_COMM_WORLD);
+
+    // same-sized buffer on every process; only the master fills it beforehand
+    char *buffer = new char[numBytes];
+    if (isMaster())
+        memcpy(buffer, stream->getObjectData(), numBytes);
+    MPI_Bcast(buffer, numBytes, MPI_CHAR, PROC_MASTER, MPI_COMM_WORLD);
+
+    // workers decode what they received
+    if (isWorker()) {
+        stream = new ObjectStream(buffer, numBytes);
+        trees = stream->getTreeCollection();
+    }
+    delete [] buffer;
+    delete stream;
+
+    const double elapsed = getRealTime() - startTime;
+    cout << trees.getNumTrees() << " trees broadcasted to workers in " << elapsed << " seconds" << endl;
+
+#endif
+}
+
+
+/**
+ *  Non-blocking check for an incoming message from any process with any tag.
+ *  The message itself is not received.
+ *
+ *  @return true if a message is waiting; always false when only one process
+ *          is running or in a non-MPI build
+ */
+bool MPIHelper::gotMessage() {
+    if (getNumProcesses() == 1)
+        return false;
+#ifdef _IQTREE_MPI
+    int pending = 0;
+    MPI_Status info;
+    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &pending, &info);
+    return pending != 0;
+#else
+    return false;
+#endif
+}
+
+/**
+ *  Post a non-blocking send of a text message to every other process.
+ *
+ *  The text is sent including its terminating NUL. For STOP_TAG, completed
+ *  earlier sends are cleaned up first. Each request and its buffer are stored
+ *  in sentMessages. Does nothing when only one process is running or in a
+ *  non-MPI build.
+ *
+ *  @param tag MPI tag attached to every message
+ *  @param msg text to send
+ */
+void MPIHelper::sendMsg(int tag, string msg) {
+    if (getNumProcesses() == 1)
+        return;
+#ifdef _IQTREE_MPI
+    if (tag == STOP_TAG)
+        cleanUpMessages();
+    for (int rank = 0; rank < getNumProcesses(); rank++) {
+        if (rank == getProcessID())
+            continue;
+        MPI_Request *pending = new MPI_Request;
+        // payload is the text plus its terminating NUL
+        ObjectStream *buffer = new ObjectStream(msg.c_str(), msg.size()+1);
+        MPI_Isend(buffer->getObjectData(), buffer->getDataLength(), MPI_CHAR, rank, tag, MPI_COMM_WORLD, pending);
+        sentMessages.push_back(make_pair(pending, buffer));
+        // test the fresh request once; the outcome is not used here
+        int done = 0;
+        MPI_Status ignored;
+        MPI_Test(pending, &done, &ignored);
+    }
+#endif
+}
+
+/**
+ *  Non-blocking check for a message with the given tag from the master; if
+ *  one is waiting, receive it and store its content in msg.
+ *
+ *  The received bytes are interpreted as a NUL-terminated string.
+ *
+ *  @param tag MPI tag to look for
+ *  @param[out] msg text of the message; untouched when nothing was received
+ *  @return true if a message was received. Note that true is also returned,
+ *          without touching msg, when only one process is running. false in
+ *          a non-MPI build with more than one process configured.
+ */
+bool MPIHelper::checkMsg(int tag, string &msg) {
+    if (getNumProcesses() == 1)
+        return true;
+#ifdef _IQTREE_MPI
+    int pending = 0;
+    MPI_Status info;
+    // Check for incoming messages
+    MPI_Iprobe(PROC_MASTER, tag, MPI_COMM_WORLD, &pending, &info);
+    if (!pending)
+        return false;
+
+    // size the buffer from the probed message, then receive exactly that one
+    int numBytes;
+    MPI_Get_count(&info, MPI_CHAR, &numBytes);
+    char *buffer = new char[numBytes];
+    MPI_Recv(buffer, numBytes, MPI_CHAR, info.MPI_SOURCE, info.MPI_TAG, MPI_COMM_WORLD, &info);
+    msg = buffer;
+    delete[] buffer;
+    return true;
+#else
+    return false;
+#endif
+}
+
+/**
+ *  Non-blocking check for a message with the given tag from the master; if
+ *  one is received, its text is printed together with this process's ID.
+ *
+ *  @param tag MPI tag to look for
+ *  @return true if a message was received; false otherwise, including when
+ *          only one process is running or in a non-MPI build
+ */
+bool MPIHelper::checkMsg(int tag) {
+    if (getNumProcesses() == 1) {
+        return false;
+    }
+#ifdef _IQTREE_MPI
+    string text;
+    if (!checkMsg(tag, text))
+        return false;
+    cout << "Worker " << getProcessID() << " gets message " << text << endl;
+    return true;
+#else
+    return false;
+#endif
+}
+
+
+/**
+ *  Receive pending tree messages with the given tag and append their trees
+ *  to trees.
+ *
+ *  Messages are polled with MPI_Iprobe. The loop ends when no message is
+ *  waiting and, if fromAll is set, a message from every other process has
+ *  been seen; it also ends as soon as trees holds at least maxNumTrees trees.
+ *  With fromAll set the loop therefore keeps polling until all other
+ *  processes have sent something.
+ *
+ *  If a received message carries STOP_TAG its content is printed, MPI is
+ *  finalized and the program exits with status 0.
+ *
+ *  numTreeReceived is increased by the final size of trees. Does nothing when
+ *  only one process is running or in a non-MPI build.
+ *
+ *  @param fromAll wait until every other process has delivered a message
+ *  @param maxNumTrees stop receiving once trees contains this many trees
+ *  @param[out] trees collection the received trees are appended to
+ *  @param tag MPI tag of the messages to receive
+ */
+void MPIHelper::receiveTrees(bool fromAll, int maxNumTrees, TreeCollection &trees, int tag) {
+    if (getNumProcesses() == 1) {
+        return;
+    }
+#ifdef _IQTREE_MPI
+    int flag = 0;
+    int minNumTrees = 0;
+    // nodes[p] is true once process p has delivered a message; a vector
+    // replaces the former variable-length array, which is not standard C++
+    vector<bool> nodes(getNumProcesses(), false);
+    if (fromAll)
+        minNumTrees = getNumProcesses() - 1;
+    nodes[getProcessID()] = true;
+    // Process all pending messages
+    MPI_Status status;
+    size_t totalMsgSize = 0;
+    do {
+        int numBytes;
+        flag = 0;
+        // Check for incoming messages
+        MPI_Iprobe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &flag, &status);
+        // flag == true if there is a message
+        if (flag) {
+            //cout << "Getting messages from node " << status.MPI_SOURCE << endl;
+            MPI_Get_count(&status, MPI_CHAR, &numBytes);
+            totalMsgSize += numBytes;
+            char *recvBuffer = new char[numBytes];
+            MPI_Recv(recvBuffer, numBytes, MPI_CHAR, status.MPI_SOURCE, status.MPI_TAG, MPI_COMM_WORLD, &status);
+            ObjectStream os(recvBuffer, numBytes);
+            // The stream keeps its own copy of the bytes, so release the
+            // receive buffer right away. Previously it was leaked whenever
+            // the loop was left through the break below.
+            delete [] recvBuffer;
+            if (status.MPI_TAG == STOP_TAG) {
+                cout <<  os.getObjectData() << endl;
+                MPI_Finalize();
+                exit(0);
+            }
+            TreeCollection curTrees = os.getTreeCollection();
+            trees.addTrees(curTrees);
+            if (trees.getNumTrees() >= maxNumTrees) {
+                break;
+            }
+            if (fromAll && !nodes[status.MPI_SOURCE]) {
+                nodes[status.MPI_SOURCE] = true;
+                minNumTrees--;
+            }
+        }
+    } while (minNumTrees > 0 || flag);
+    numTreeReceived += trees.getNumTrees();
+    if (trees.getNumTrees() > 0) {
+        cout << "Proc " << getProcessID() << ": " << trees.getNumTrees() << " trees received from other processes (" << totalMsgSize << " bytes)" << endl;
+    }
+#endif
+}
+
+/**
+ *  Receive at most one pending tree message with the given tag (from any
+ *  process) and append its trees to trees. Never blocks waiting for a message
+ *  to arrive.
+ *
+ *  @param[out] trees collection the received trees are appended to
+ *  @param tag MPI tag of the message to receive
+ *  @return ID of the process the message came from, or -1 when no message
+ *          was waiting, only one process is running, or in a non-MPI build
+ */
+int MPIHelper::receiveTrees(TreeCollection &trees, int tag) {
+    if (getNumProcesses() == 1) {
+        return -1;
+    }
+#ifdef _IQTREE_MPI
+    int pending = 0;
+    MPI_Status info;
+    // Check for incoming messages
+    MPI_Iprobe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &pending, &info);
+    if (!pending)
+        return -1;
+
+    // size the buffer from the probed message, then receive exactly that one
+    int numBytes;
+    MPI_Get_count(&info, MPI_CHAR, &numBytes);
+    char *buffer = new char[numBytes];
+    MPI_Recv(buffer, numBytes, MPI_CHAR, info.MPI_SOURCE, info.MPI_TAG, MPI_COMM_WORLD, &info);
+    ObjectStream decoder(buffer, numBytes);
+    TreeCollection received = decoder.getTreeCollection();
+    trees.addTrees(received);
+    delete [] buffer;
+    return info.MPI_SOURCE;
+#else
+    return -1;
+#endif
+}
+
+/**
+ *  Release the request and buffer of every entry in sentMessages whose
+ *  non-blocking send has completed, and remove those entries from the list.
+ *
+ *  At verbosity VB_MED or higher the number of cleaned messages is printed
+ *  when it is non-zero.
+ *
+ *  @return number of entries removed; always 0 in a non-MPI build
+ */
+int MPIHelper::cleanUpMessages() {
+#ifdef _IQTREE_MPI
+    int cleaned = 0;
+    // walk by index: erase() shifts later entries down, so the index only
+    // advances when the current entry is kept
+    size_t idx = 0;
+    while (idx < sentMessages.size()) {
+        int done = 0;
+        MPI_Status info;
+        MPI_Test(sentMessages[idx].first, &done, &info);
+        if (!done) {
+            idx++;
+            continue;
+        }
+        delete sentMessages[idx].first;
+        delete sentMessages[idx].second;
+        cleaned++;
+        sentMessages.erase(sentMessages.begin()+idx);
+    }
+    if (verbose_mode >= VB_MED && cleaned)
+        cout << cleaned << " messages sent and cleaned up" << endl;
+    return cleaned;
+#else
+    return 0;
+#endif
+}
+
+#ifdef _IQTREE_MPI
+/**
+ *  Blocking send of a string, including its terminating NUL, to one process.
+ *
+ *  @param str string to send
+ *  @param dest ID of the destination process
+ *  @param tag MPI tag attached to the message
+ */
+void MPIHelper::sendString(string &str, int dest, int tag) {
+    // length()+1 so that the terminating NUL travels with the text
+    MPI_Send((char*)str.c_str(), str.length()+1, MPI_CHAR, dest, tag, MPI_COMM_WORLD);
+}
+
+/**
+ *  Dump a checkpoint to text and send it to one process with TREE_TAG
+ *  (blocking, via sendString()).
+ *
+ *  @param ckp checkpoint to send
+ *  @param dest ID of the destination process
+ */
+void MPIHelper::sendCheckpoint(Checkpoint *ckp, int dest) {
+    stringstream dumped;
+    ckp->dump(dumped);
+    string text = dumped.str();
+    sendString(text, dest, TREE_TAG);
+}
+
+
+/**
+ *  Blocking receive of a string.
+ *
+ *  The received bytes are interpreted as a NUL-terminated string.
+ *
+ *  @param[out] str the text received
+ *  @param src ID of the process to receive from
+ *  @param tag MPI tag of the message to receive
+ *  @return ID of the process the message came from
+ */
+int MPIHelper::recvString(string &str, int src, int tag) {
+    // block until a matching message is available and learn its size
+    MPI_Status info;
+    MPI_Probe(src, tag, MPI_COMM_WORLD, &info);
+    int numBytes;
+    MPI_Get_count(&info, MPI_CHAR, &numBytes);
+    // receive exactly the probed message
+    char *buffer = new char[numBytes];
+    MPI_Recv(buffer, numBytes, MPI_CHAR, info.MPI_SOURCE, info.MPI_TAG, MPI_COMM_WORLD, &info);
+    str = buffer;
+    delete [] buffer;
+    return info.MPI_SOURCE;
+}
+
+/**
+ *  Blocking receive of a checkpoint sent with TREE_TAG; the received text is
+ *  loaded into ckp.
+ *
+ *  @param[out] ckp checkpoint that loads the received text
+ *  @param src ID of the process to receive from
+ *  @return ID of the process the message came from
+ */
+int MPIHelper::recvCheckpoint(Checkpoint *ckp, int src) {
+    string text;
+    const int sender = recvString(text, src, TREE_TAG);
+    stringstream received(text);
+    ckp->load(received);
+    return sender;
+}
+
+/**
+ *  Broadcast a checkpoint from the master to all workers.
+ *
+ *  Two collective MPI_Bcast calls are made: first the byte count of the
+ *  dumped text (including its terminating NUL), then the text itself.
+ *  Workers load the received text into ckp.
+ *
+ *  @param[in,out] ckp on the master the checkpoint to broadcast; on workers
+ *         the checkpoint that loads the received text
+ */
+void MPIHelper::broadcastCheckpoint(Checkpoint *ckp) {
+    stringstream ss;
+    string dumped;
+    int numBytes = 0;
+    if (isMaster()) {
+        ckp->dump(ss);
+        dumped = ss.str();
+        // +1 so that the terminating NUL travels with the text
+        numBytes = dumped.length()+1;
+    }
+
+    // workers learn the size from the first broadcast
+    MPI_Bcast(&numBytes, 1, MPI_INT, PROC_MASTER, MPI_COMM_WORLD);
+
+    // same-sized buffer on every process; only the master fills it beforehand
+    char *buffer = new char[numBytes];
+    if (isMaster())
+        memcpy(buffer, dumped.c_str(), numBytes);
+
+    MPI_Bcast(buffer, numBytes, MPI_CHAR, PROC_MASTER, MPI_COMM_WORLD);
+
+    if (isWorker()) {
+        // reset the stream state and feed it the received text
+        ss.clear();
+        ss.str(buffer);
+        ckp->load(ss);
+    }
+    delete [] buffer;
+}
+
+/**
+ *  Gather the checkpoints of all processes on the master.
+ *
+ *  Every process dumps ckp to text. The text lengths are collected with
+ *  MPI_Gather and the texts themselves with MPI_Gatherv, which places them
+ *  back to back in order of process ID (the master's own dump included).
+ *  The master then loads the concatenated text into its ckp.
+ *
+ *  @param[in,out] ckp checkpoint to contribute; on the master it also loads
+ *         the gathered text
+ */
+void MPIHelper::gatherCheckpoint(Checkpoint *ckp) {
+    stringstream ss;
+    ckp->dump(ss);
+    string localDump = ss.str();
+    int localLen = localDump.length();
+
+    // receive-side arrays exist on the master only
+    int *lengths = NULL, *offsets = NULL;
+    char *gathered = NULL;
+
+    if (isMaster()) {
+        lengths = new int[getNumProcesses()];
+        offsets = new int[getNumProcesses()];
+    }
+    // step 1: the master learns how many bytes each process contributes
+    MPI_Gather(&localLen, 1, MPI_INT, lengths, 1, MPI_INT, PROC_MASTER, MPI_COMM_WORLD);
+
+    if (isMaster()) {
+        // each contribution starts where the previous one ends
+        int total = 0;
+        for (int rank = 0; rank < getNumProcesses(); rank++) {
+            offsets[rank] = total;
+            total += lengths[rank];
+        }
+        // one extra zeroed byte terminates the gathered text
+        gathered = new char[total+1];
+        memset(gathered, 0, total+1);
+    }
+    // step 2: collect the texts themselves
+    MPI_Gatherv((char*)localDump.c_str(), localLen, MPI_CHAR, gathered, lengths, offsets, MPI_CHAR, PROC_MASTER, MPI_COMM_WORLD);
+
+    if (isMaster()) {
+        // reset the stream state and load the gathered text
+        ss.clear();
+        ss.str(gathered);
+        ckp->load(ss);
+
+        delete [] gathered;
+        delete [] offsets;
+        delete [] lengths;
+    }
+}
+
+#endif
+
+/**
+ *  Destructor. Currently does nothing: the cleanUpMessages() call below is
+ *  commented out, so entries still held in sentMessages are not released here.
+ */
+MPIHelper::~MPIHelper() {
+//    cleanUpMessages();
+}
+
diff --git a/MPIHelper.h b/MPIHelper.h
new file mode 100644
index 0000000..615ef48
--- /dev/null
+++ b/MPIHelper.h
@@ -0,0 +1,305 @@
+/***************************************************************************
+ *   Copyright (C) 2015 by                                                 *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef MPIHELPER_H
+#define MPIHELPER_H
+
+#include <string>
+#include <vector>
+#include "tools.h"
+#include "TreeCollection.h"
+#include "ObjectStream.h"
+
+#ifdef _IQTREE_MPI
+
+#include <mpi.h>
+
+#endif
+
+#define PROC_MASTER 0
+#define TREE_TAG 1 // Message contain trees
+#define STOP_TAG 2 // Stop message
+#define BOOT_TAG 3 // Message to please send bootstrap trees
+#define BOOT_TREE_TAG 4 // bootstrap tree tag
+#define LOGL_CUTOFF_TAG 5 // send logl_cutoff for ultrafast bootstrap
+
+using namespace std;
+
+/**
+ *  Singleton wrapper around the MPI calls used to exchange trees, text
+ *  messages and checkpoints between processes. In a build without
+ *  _IQTREE_MPI the communication methods do nothing.
+ */
+class MPIHelper {
+public:
+    /**
+    *  Singleton method: get the one and only instance of the class
+    */
+    static MPIHelper &getInstance();
+
+    /**
+        destructor
+    */
+    ~MPIHelper();
+
+    /** @return total number of processes */
+    int getNumProcesses() const {
+        return numProcesses;
+    }
+
+    void setNumProcesses(int numProcesses) {
+        MPIHelper::numProcesses = numProcesses;
+    }
+
+    /** @return ID of the current process */
+    int getProcessID() const {
+        return processID;
+    }
+
+    /** @return true if the current process is PROC_MASTER */
+    bool isMaster() const {
+        return processID == PROC_MASTER;
+    }
+
+    /** @return true if the current process is not PROC_MASTER */
+    bool isWorker() const {
+        return processID != PROC_MASTER;
+    }
+
+    void setProcessID(int processID) {
+        MPIHelper::processID = processID;
+    }
+
+    /** @return true if got any message from another process */
+    bool gotMessage();
+
+    /**
+     *  Receive trees that were sent to the current process
+     *
+     *  @param fromAll
+     *      wait until at least one message from each other process has been received
+     *  @param maxNumTrees
+     *      Receive only up to maxNumTrees trees, to prevent the function from blocking
+     *      because it can constantly receive new trees
+     *  @param trees[OUT]
+     *      Trees received from other processes
+     *  @param tag MPI tag
+     */
+    void receiveTrees(bool fromAll, int maxNumTrees, TreeCollection &trees, int tag);
+
+
+    /**
+     *  Receive trees that were sent to the current process (at most one message)
+     *
+     *  @param trees[OUT]
+     *      Trees received from other processes
+     *  @param tag MPI tag
+     *  @return source process ID, or -1 if nothing was received
+     */
+    int receiveTrees(TreeCollection &trees, int tag);
+
+    /**
+     *   Send trees to all other processes
+     *   @param treeStrings vector of trees
+     *   @param scores vector containing scores of the trees with same order as in treeStrings
+     *   @param tag used to classify the message
+     */
+    void distributeTrees(vector<string> &treeStrings, vector<double> &scores, int tag = TREE_TAG);
+
+    /**
+    *   Similar to distributeTrees but only 1 tree is sent
+    *   @param treeString the tree
+    *   @param score its score
+    *   @param tag used to classify the message
+    */
+    void distributeTree(string treeString, double score, int tag);
+
+    /**
+     *   Send trees to a dest process
+     *   @param dest MPI rank of destination process
+     *   @param treeStrings vector of trees
+     *   @param scores vector containing scores of the trees with same order as in treeStrings
+     *   @param tag used to classify the message
+     */
+    void sendTrees(int dest, vector<string> &treeStrings, vector<double> &scores, int tag);
+
+    /**
+     *   Send one tree to a dest process
+     *   @param dest MPI rank of destination process
+     *   @param treeString NEWICK tree string
+     *   @param score its score
+     *   @param tag used to classify the message
+     */
+    void sendTree(int dest, string treeString, double score, int tag);
+
+    /**
+     *   Blocking send to, and then receive of trees from, a dest process
+     *   @param dest MPI rank of destination process
+     *   @param[in,out] treeStrings trees to send; replaced by the trees received
+     *   @param[in,out] scores their scores; replaced by the scores received
+     *   @param tag used to classify the outgoing message
+     *   @return the tag of the message received in reply
+     */
+    int sendRecvTrees(int dest, vector<string> &treeStrings, vector<double> &scores, int tag);
+
+    /**
+     *   Blocking receive of trees from any process, followed by a send of trees back to it
+     *   @param[in,out] treeStrings trees to send back; replaced by the trees received
+     *   @param[in,out] scores their scores; replaced by the scores received
+     *   @param[in,out] should_send flags indexed by process rank; when the flag of the
+     *       sender is false only a single entry is sent back. The flag of the sender is
+     *       set to false afterwards
+     *   @param tag used to classify the outgoing message
+     *   @return MPI rank of the process the trees were exchanged with
+     */
+    int recvSendTrees(vector<string> &treeStrings, vector<double> &scores, vector<bool> &should_send, int tag);
+
+    /**
+        gather trees from workers to master
+    */
+    void gatherTrees(TreeCollection &trees);
+
+    /**
+        broadcast trees from master to workers
+    */
+    void broadcastTrees(TreeCollection &trees);
+
+    /**
+     *  Send a message to all other processes, e.g. STOP_TAG
+     */
+    void sendMsg(int tag, string msg);
+
+    /**
+     *  Check if a message with the given tag is received from master, e.g. STOP_TAG
+     */
+    bool checkMsg(int tag);
+
+    /**
+     *  Check if a message with the given tag is received from master, e.g. STOP_TAG
+     *  @param[out] msg content of the received message
+     */
+    bool checkMsg(int tag, string &msg);
+
+
+#ifdef _IQTREE_MPI
+    /** wrapper for MPI_Send a string
+        @param str string to send
+        @param dest destination process
+        @param tag message tag
+    */
+    void sendString(string &str, int dest, int tag);
+
+    /** wrapper for MPI_Recv a string
+        @param[out] str string received
+        @param src source process
+        @param tag message tag
+        @return the source process that sent the message
+    */
+    int recvString(string &str, int src = MPI_ANY_SOURCE, int tag = MPI_ANY_TAG);
+
+    /** wrapper for MPI_Send an entire Checkpoint object
+        @param ckp Checkpoint object to send
+        @param dest destination process
+    */
+    void sendCheckpoint(Checkpoint *ckp, int dest);
+
+    /** wrapper for MPI_Recv an entire Checkpoint object
+        @param[out] ckp Checkpoint object received
+        @param src source process
+        @return the source process that sent the message
+    */
+    int recvCheckpoint(Checkpoint *ckp, int src = MPI_ANY_SOURCE);
+
+    /**
+        wrapper for MPI_Bcast to broadcast checkpoint from Master to all Workers
+        @param ckp Checkpoint object
+    */
+    void broadcastCheckpoint(Checkpoint *ckp);
+
+    /**
+        wrapper for MPI_Gather to gather all checkpoints into Master
+        @param ckp Checkpoint object
+    */
+    void gatherCheckpoint(Checkpoint *ckp);
+#endif
+
+    /** add inc to the counter of trees sent */
+    void increaseTreeSent(int inc = 1) {
+        numTreeSent += inc;
+    }
+
+    /** add inc to the counter of trees received */
+    void increaseTreeReceived(int inc = 1) {
+        numTreeReceived += inc;
+    }
+
+private:
+    /**
+    *  Remove the buffers for finished messages
+    *  @return number of messages removed
+    */
+    int cleanUpMessages();
+
+private:
+    MPIHelper() { }; // private: objects are only created inside getInstance()
+    MPIHelper(MPIHelper const &) { }; // private: no copying from outside the class
+    void operator=(MPIHelper const &) { }; // private: no assignment from outside the class
+
+    // ID of the current process
+    int processID;
+
+    // total number of processes
+    int numProcesses;
+
+public:
+    int getNumTreeReceived() const {
+        return numTreeReceived;
+    }
+
+    void setNumTreeReceived(int numTreeReceived) {
+        MPIHelper::numTreeReceived = numTreeReceived;
+    }
+
+    int getNumTreeSent() const {
+        return numTreeSent;
+    }
+
+    void setNumTreeSent(int numTreeSent) {
+        MPIHelper::numTreeSent = numTreeSent;
+    }
+    
+    /** set the three counters (trees sent, trees received, NNI searches) to zero */
+    void resetNumbers() {
+        numTreeSent = 0;
+        numTreeReceived = 0;
+        numNNISearch = 0;
+    }
+
+private:
+    // counter of trees sent by this process
+    int numTreeSent;
+
+    // counter of trees received by this process
+    int numTreeReceived;
+
+public:
+    int getNumNNISearch() const {
+        return numNNISearch;
+    }
+
+    void setNumNNISearch(int numNNISearch) {
+        MPIHelper::numNNISearch = numNNISearch;
+    }
+
+private:
+    // counter exposed through getNumNNISearch()/setNumNNISearch()
+    int numNNISearch;
+
+#ifdef _IQTREE_MPI
+    // A list storing messages and the corresponding requests that have been sent from the current process.
+    // When a send has completed, cleanUpMessages() deletes its entry from the list
+    vector< pair<MPI_Request *, ObjectStream *> > sentMessages;
+#endif
+
+
+};
+
+#endif
diff --git a/ObjectStream.cpp b/ObjectStream.cpp
new file mode 100644
index 0000000..f06efea
--- /dev/null
+++ b/ObjectStream.cpp
@@ -0,0 +1,113 @@
+//
+// Created by tung on 6/23/15.
+//
+
+#include "ObjectStream.h"
+
+/**
+ *  Build a stream holding a private copy of a raw byte buffer.
+ *
+ *  @param data bytes to copy; the caller keeps ownership of this buffer
+ *  @param length number of bytes to copy
+ */
+ObjectStream::ObjectStream(const char *data, size_t length) {
+    objectDataSize = length;
+    objectData = new char[objectDataSize];
+    memcpy(objectData, data, objectDataSize);
+}
+
+/**
+ *  Build a stream holding the serialized form of a tree collection.
+ *
+ *  @param trees collection to serialize
+ */
+ObjectStream::ObjectStream(TreeCollection &trees) {
+    // start from an empty stream so that initFromTreeCollection() has no
+    // previous buffer to release
+    objectDataSize = 0;
+    objectData = NULL;
+    initFromTreeCollection(trees);
+}
+
+/**
+ *  Replace the content of this stream with the serialized form of a tree
+ *  collection.
+ *
+ *  Layout of the byte stream:
+ *    - three size_t values: byte sizes of the string, score and source-ID blocks
+ *    - string block: the tree strings, each followed by a NUL
+ *    - score block: the scores as raw doubles
+ *    - source-ID block: the source process IDs as raw ints
+ *  Values are copied in the representation of the current machine.
+ *
+ *  @param trees collection to serialize
+ */
+void ObjectStream::initFromTreeCollection(TreeCollection &trees) {
+    vector<string> treeStrings = trees.getTreeStrings();
+    vector<double> scores = trees.getScores();
+    vector<int> sourceProcID = trees.getSourceProcID();
+
+    // byte sizes of the three data blocks, in stream order
+    char *packedStrings;
+    size_t blockSize[3];
+    blockSize[0] = serializeStrings(treeStrings, packedStrings);
+    blockSize[1] = scores.size() * sizeof(double);
+    blockSize[2] = sourceProcID.size() * sizeof(int);
+    const size_t headerSize = sizeof(blockSize);
+
+    objectDataSize = headerSize + blockSize[0] + blockSize[1] + blockSize[2];
+
+    // drop any previous content
+    if (objectData != NULL) {
+        delete[] objectData;
+    }
+    objectData = new char[objectDataSize];
+
+    // header: the three block sizes
+    char *cursor = objectData;
+    memcpy(cursor, blockSize, headerSize);
+    cursor += headerSize;
+
+    // payload: strings, then scores, then source process IDs
+    memcpy(cursor, packedStrings, blockSize[0]);
+    cursor += blockSize[0];
+
+    memcpy(cursor, scores.data(), blockSize[1]);
+    cursor += blockSize[1];
+
+    memcpy(cursor, sourceProcID.data(), blockSize[2]);
+
+    delete [] packedStrings;
+}
+
+/**
+ *  Reconstruct a TreeCollection from the byte stream, which must have the
+ *  layout written by initFromTreeCollection(): three size_t block sizes
+ *  followed by the string, score and source-ID blocks.
+ *
+ *  The decoded values are stored in vectors rather than in stack arrays sized
+ *  by the message content, so large or empty collections are handled safely.
+ *  A stream that is too short for its header, or whose header announces more
+ *  data than the stream holds, yields an empty collection instead of reading
+ *  out of bounds.
+ *
+ *  @return the decoded collection; one source process ID per score, with
+ *          missing IDs set to 0
+ */
+TreeCollection ObjectStream::getTreeCollection() {
+    const size_t headerSize = sizeof(size_t) * 3;
+    vector<string> treeStrings;
+    vector<double> scores;
+    vector<int> sourceProcID;
+
+    // not even a complete header: nothing to decode
+    if (objectData == NULL || objectDataSize < headerSize) {
+        TreeCollection emptyTrees(treeStrings, scores, sourceProcID);
+        return emptyTrees;
+    }
+
+    size_t metaInfo[3];
+    memcpy(metaInfo, objectData, headerSize);
+    size_t stringDataSize = metaInfo[0];
+    size_t doubleDataSize = metaInfo[1];
+    size_t intDataSize = metaInfo[2];
+
+    // the three blocks must fit into what follows the header
+    size_t remaining = objectDataSize - headerSize;
+    bool fits = stringDataSize <= remaining;
+    if (fits) {
+        remaining -= stringDataSize;
+        fits = doubleDataSize <= remaining;
+    }
+    if (fits) {
+        remaining -= doubleDataSize;
+        fits = intDataSize <= remaining;
+    }
+    if (!fits) {
+        TreeCollection emptyTrees(treeStrings, scores, sourceProcID);
+        return emptyTrees;
+    }
+
+    size_t numTrees = doubleDataSize / sizeof(double);
+    deserializeStrings(objectData + headerSize, stringDataSize, treeStrings);
+    assert(treeStrings.size() == numTrees);
+
+    // scores: copy whole doubles only
+    scores.resize(numTrees);
+    if (numTrees > 0)
+        memcpy(&scores[0], objectData + headerSize + stringDataSize, numTrees * sizeof(double));
+
+    // source process IDs: one per tree, never copying more than was sent
+    sourceProcID.resize(numTrees);
+    size_t intBytes = numTrees * sizeof(int);
+    if (intDataSize < intBytes)
+        intBytes = intDataSize;
+    if (intBytes > 0)
+        memcpy(&sourceProcID[0], objectData + headerSize + stringDataSize + doubleDataSize, intBytes);
+
+    TreeCollection decodedTrees(treeStrings, scores, sourceProcID);
+    return decodedTrees;
+}
+
+
+/**
+ *  Pack a vector of strings into one newly allocated char array, each string
+ *  followed by a NUL.
+ *
+ *  @param strings the strings to pack
+ *  @param[out] data set to the new array; the caller releases it with delete []
+ *  @return size of the array in bytes
+ */
+size_t ObjectStream::serializeStrings(vector<string> &strings, char *&data) {
+    const size_t count = strings.size();
+
+    // first pass: every string needs its length plus one byte for the NUL
+    size_t totalSize = 0;
+    for (size_t k = 0; k < count; k++)
+        totalSize += strings[k].length() + 1;
+
+    // second pass: copy the strings back to back
+    data = new char[totalSize];
+    char *cursor = data;
+    for (size_t k = 0; k < count; k++) {
+        const size_t chunk = strings[k].length() + 1;
+        strncpy(cursor, strings[k].c_str(), chunk);
+        cursor += chunk;
+    }
+    return totalSize;
+}
+
+/**
+ *  Split a char array into strings at every NUL byte.
+ *
+ *  Bytes after the last NUL do not form a string and are dropped.
+ *
+ *  @param data the bytes to split
+ *  @param length number of bytes in data
+ *  @param[out] strings cleared, then filled with the strings found
+ */
+void ObjectStream::deserializeStrings(char *data, size_t length, vector<string> &strings) {
+    strings.clear();
+    // first byte of the string currently being scanned
+    size_t start = 0;
+    for (size_t pos = 0; pos < length; pos++) {
+        if (data[pos] != '\0')
+            continue;
+        // NUL found: everything since start is one complete string
+        strings.push_back(string(data + start, pos - start));
+        start = pos + 1;
+    }
+}
+
+
diff --git a/ObjectStream.h b/ObjectStream.h
new file mode 100644
index 0000000..af6f46d
--- /dev/null
+++ b/ObjectStream.h
@@ -0,0 +1,82 @@
+//
+// Created by tung on 6/23/15.
+//
+
+#ifndef IQTREE_OBJECTSTREAM_H
+#define IQTREE_OBJECTSTREAM_H
+#include "TreeCollection.h"
+
+/**
+ *  This class is used to serialize object. It converts different object to byte stream
+ *  and can also read in byte stream to reconstruct the object
+ */
+class ObjectStream {
+public:
+
+    /**
+     * Construct from a raw byte stream of the given length.
+     * NOTE(review): defined in ObjectStream.cpp; presumably copies the bytes
+     * into objectData -- confirm against the definition.
+     */
+    ObjectStream(const char* data, size_t length);
+
+    /**
+     * Construct from a tree collection (defined in ObjectStream.cpp).
+     */
+    ObjectStream(TreeCollection& trees);
+
+    /**
+     * Construct an empty stream: no buffer and a data length of 0.
+     * Both members are initialized so that getDataLength() never reads an
+     * indeterminate value.
+     */
+    ObjectStream() : objectData(NULL), objectDataSize(0) {
+    }
+
+    /**
+     * Release the byte stream.
+     * NOTE(review): the class declares no copy constructor or assignment
+     * operator, so a copied ObjectStream would delete the same buffer twice.
+     */
+    virtual ~ObjectStream() {
+        if (objectData != NULL)
+            delete [] objectData;
+    }
+
+    /**
+     *  Convert a tree collection into the internal byte stream
+     *  @param[IN] trees
+     */
+    void initFromTreeCollection(TreeCollection &trees);
+
+    /**
+     *  Reconstruct TreeCollection from a byte stream
+     */
+    TreeCollection getTreeCollection();
+
+
+public:
+    /**
+     *  @return the stored value of objectDataSize
+     */
+    size_t getDataLength() const {
+        return objectDataSize;
+    }
+
+public:
+    /**
+     *  @return the internal buffer; it is still deleted by this object's destructor
+     */
+    char *getObjectData() const {
+        return objectData;
+    }
+
+private:
+    /**
+     *  Byte stream representing the object
+     */
+    char* objectData;
+
+    /**
+     *  Size value reported by getDataLength()
+     */
+    size_t objectDataSize;
+
+
+    /**
+     *  Convert vector of strings to array of chars
+     *  @param [IN] strings the vector strings
+     *  @param [OUT] data the char array, allocated with new[] by this call
+     *  @return size of the char array
+     */
+    size_t serializeStrings(vector<string> &strings, char *&data);
+
+    /**
+     *  Convert array of chars to vector of strings
+     *  @param [IN] data byte stream representing vector<string>
+     *  @param [IN] length size of data
+     *  @param [OUT] strings the reconstructed vector<string>
+     */
+    void deserializeStrings(char *data, size_t length, vector<string> &strings);
+
+};
+#endif // IQTREE_OBJECTSTREAM_H
+
+
diff --git a/README.md b/README.md
index 9f60b26..76274a6 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,71 @@
 IQ-TREE
--------
+=======
 
-Efficient phylogenetic software by maximum likelihood
+Efficient and versatile phylogenomic software by maximum likelihood <http://www.iqtree.org>
 
-Please see our github wiki for more information: <https://github.com/Cibiv/IQ-TREE/wiki>
+Introduction
+------------
+
+The IQ-TREE software was created as the successor of IQPNNI and [TREE-PUZZLE](http://www.tree-puzzle.de) (thus the name IQ-TREE). IQ-TREE was motivated by the rapid accumulation of phylogenomic data, leading to a need for efficient phylogenomic software that can handle a large amount of data and provide more complex models of sequence evolution. To this end, IQ-TREE can utilize multicore computers and distributed parallel computing to speed up the analysis. IQ-TREE automatically performs [...]
+
+As input IQ-TREE accepts all common sequence alignment formats including PHYLIP, FASTA, Nexus, Clustal and MSF. As output IQ-TREE will write a self-readable report file (name suffix `.iqtree`), a NEWICK tree file (`.treefile`)  which can be visualized by tree viewer programs such as [FigTree](http://tree.bio.ed.ac.uk/software/figtree/), [Dendroscope](http://dendroscope.org) or [iTOL](http://itol.embl.de).
+
+
+Key features of IQ-TREE
+-----------------------
+
+* __Efficient search algorithm__: Fast and effective stochastic algorithm to reconstruct phylogenetic trees by maximum likelihood. IQ-TREE compares favorably to RAxML and PhyML in terms of likelihood while requiring a similar amount of computing time ([Nguyen et al., 2015]).
+* __Ultrafast bootstrap__: An ultrafast bootstrap approximation (UFBoot) to assess branch supports. UFBoot is 10 to 40 times faster than RAxML rapid bootstrap and obtains less biased support values ([Minh et al., 2013]).
+* __Ultrafast model selection__: An ultrafast and automatic model selection (ModelFinder) which is 10 to 100 times faster than jModelTest and ProtTest. ModelFinder also finds best-fit partitioning scheme like PartitionFinder.
+* __Phylogenetic testing__: Several fast branch tests like SH-aLRT and aBayes test ([Anisimova et al., 2011]) and tree topology tests like the approximately unbiased (AU) test ([Shimodaira, 2002]).
+
+
+The strength of IQ-TREE is the availability of a wide variety of phylogenetic models:
+
+* __Common models__: All [common substitution models](http://www.iqtree.org/doc/Substitution-Models) for DNA, protein, codon, binary and morphological data with [rate heterogeneity among sites](http://www.iqtree.org/doc/Substitution-Models/#rate-heterogeneity-across-sites) and [ascertainment bias correction](http://www.iqtree.org/doc/Substitution-Models/#ascertainment-bias-correction) for e.g. SNP data.
+* __[Partition models](http://www.iqtree.org/doc/Complex-Models/#partition-models)__: Allowing individual models for different genomic loci (e.g. genes or codon positions), mixed data types, mixed rate heterogeneity types, linked or unlinked branch lengths between partitions.
+* __Mixture Models__: [fully customizable mixture models](http://www.iqtree.org/doc/Complex-Models/#mixture-models) and [empirical protein mixture models](http://www.iqtree.org/doc/Substitution-Models/#protein-models).
+
+IQ-TREE web service
+-------------------
+
+For a quick start you can also try the IQ-TREE web server, which performs online computation using a dedicated computing cluster. It is very easy to use with as few as just 3 clicks! Try it out at
+
+<http://iqtree.cibiv.univie.ac.at>
+
+
+User support
+------------
+
+Please refer to the [user documentation](http://www.iqtree.org/doc/) and [frequently asked questions](http://www.iqtree.org/doc/Frequently-Asked-Questions). If you have further questions, feedback, feature requests, or bug reports, please sign up to the following Google group (if not done yet) and post a topic there:
+
+<https://groups.google.com/d/forum/iqtree>
+
+_The average response time is one working day._
+
+Citations
+---------
+
+To cite IQ-TREE please use:
+
+* L.-T. Nguyen, H.A. Schmidt, A. von Haeseler, and B.Q. Minh (2015) IQ-TREE: A fast and effective stochastic algorithm for estimating maximum likelihood phylogenies. *Mol. Biol. Evol.*, 32, 268-274. [DOI: 10.1093/molbev/msu300](http://dx.doi.org/10.1093/molbev/msu300)
+
+For the ultrafast bootstrap (UFBoot) please cite:
+
+* B.Q. Minh, M.A.T. Nguyen, and A. von Haeseler (2013) Ultrafast approximation for phylogenetic bootstrap. *Mol. Biol. Evol.*, 30:1188-1195. [DOI: 10.1093/molbev/mst024](http://dx.doi.org/10.1093/molbev/mst024)
+
+#### Credits and Acknowledgements
+
+Some parts of the code were taken from the following packages/libraries: [Phylogenetic likelihood library](http://www.libpll.org), [TREE-PUZZLE](http://www.tree-puzzle.de), 
+[BIONJ](http://dx.doi.org/10.1093/oxfordjournals.molbev.a025808), [Nexus Class Library](http://dx.doi.org/10.1093/bioinformatics/btg319), [Eigen library](http://eigen.tuxfamily.org/),
+[SPRNG library](http://www.sprng.org), [Zlib library](http://www.zlib.net), gzstream library, [vectorclass library](http://www.agner.org/optimize/), [GNU scientific library](https://www.gnu.org/software/gsl/).
+
+
+IQ-TREE was partially funded by the [Austrian Science Fund - FWF](http://www.fwf.ac.at/) (grant no. I760-B17 from 2012-2015 and I 2508-B29 from 2016-2019) and the [University of Vienna](https://www.univie.ac.at/) (Initiativkolleg I059-N).
+
+
+[Anisimova et al., 2011]: http://dx.doi.org/10.1093/sysbio/syr041
+[Guindon et al., 2010]: http://dx.doi.org/10.1093/sysbio/syq010
+[Minh et al., 2013]: http://dx.doi.org/10.1093/molbev/mst024
+[Nguyen et al., 2015]: http://dx.doi.org/10.1093/molbev/msu300
+[Shimodaira, 2002]: http://dx.doi.org/10.1080/10635150290069913
diff --git a/TreeCollection.cpp b/TreeCollection.cpp
new file mode 100644
index 0000000..26d702c
--- /dev/null
+++ b/TreeCollection.cpp
@@ -0,0 +1,56 @@
+//
+// Created by Tung Nguyen on 6/23/15.
+//
+
+#include "TreeCollection.h"
+#include "MPIHelper.h"
+
+using namespace std;
+
+/**
+ *  Build a collection from three vectors, each of which is copied.
+ *  Asserts that there is exactly one score per tree string; the length of
+ *  sourceProcID is not checked.
+ */
+TreeCollection::TreeCollection(vector<string>& trees, vector<double>& scores, vector<int> &sourceProcID) {
+    assert(scores.size() == trees.size());
+    treeStrings = trees;
+    // The parameters below shadow the members, hence the explicit this->.
+    this->sourceProcID = sourceProcID;
+    this->scores = scores;
+}
+
+/**
+ *  Return the i-th tree string together with its score.
+ *  The index is not range-checked; only the equal length of the tree and
+ *  score vectors is asserted.
+ */
+pair<string, double> TreeCollection::getTree(int i) {
+    assert(scores.size() == treeStrings.size());
+    pair<string, double> treeAndScore(treeStrings[i], scores[i]);
+    return treeAndScore;
+}
+
+/**
+ *  Empty the collection: all three member vectors are cleared.
+ */
+void TreeCollection::clear() {
+    sourceProcID.clear();
+    scores.clear();
+    treeStrings.clear();
+}
+
+/**
+ *  Append every entry of another collection (tree strings, scores and source
+ *  process IDs) to the end of this one, keeping their order.
+ */
+void TreeCollection::addTrees(TreeCollection &trees) {
+    const vector<string> &extraTrees = trees.treeStrings;
+    const vector<double> &extraScores = trees.scores;
+    const vector<int> &extraProcIDs = trees.sourceProcID;
+
+    treeStrings.insert(treeStrings.end(), extraTrees.begin(), extraTrees.end());
+    scores.insert(scores.end(), extraScores.begin(), extraScores.end());
+    sourceProcID.insert(sourceProcID.end(), extraProcIDs.begin(), extraProcIDs.end());
+}
+
+/**
+ *  Append every tree of a candidate set, visiting it from rbegin() to rend().
+ *  Each appended tree is tagged with the ID of the calling process as
+ *  reported by MPIHelper.
+ */
+void TreeCollection::addTrees(CandidateSet &candidateTrees) {
+    for (CandidateSet::reverse_iterator cand = candidateTrees.rbegin(); cand != candidateTrees.rend(); ++cand) {
+        // Map key = score, mapped value = candidate record holding the tree string.
+        treeStrings.push_back(cand->second.tree);
+        scores.push_back(cand->first);
+        sourceProcID.push_back(MPIHelper::getInstance().getProcessID());
+    }
+}
+
+
+
+/**
+ *  @return number of stored tree strings; asserts that the score vector has
+ *          the same length
+ */
+size_t TreeCollection::getNumTrees() {
+    assert(treeStrings.size() == scores.size());
+    return treeStrings.size();
+}
diff --git a/TreeCollection.h b/TreeCollection.h
new file mode 100644
index 0000000..aecf328
--- /dev/null
+++ b/TreeCollection.h
@@ -0,0 +1,63 @@
+//
+// Created by tung on 6/23/15.
+//
+
+#ifndef IQTREE_TREECOLLECTION_H
+#define IQTREE_TREECOLLECTION_H
+#include "candidateset.h"
+
+/**
+ *  A container for a set of trees together with their scores
+ */
+class TreeCollection {
+private:
+    // The three vectors are filled in step: entry i of each belongs to tree i.
+    vector<string> treeStrings;
+    vector<double> scores;
+    vector<int> sourceProcID;
+public:
+
+    /**
+     *  Construct an empty collection
+     */
+    TreeCollection() {}
+
+    /**
+     *  Construct from existing vectors (defined in TreeCollection.cpp)
+     */
+    TreeCollection(vector<string>& trees, vector<double>& scores, vector<int> &sourceProcID);
+
+    /**
+     *  Append all trees of another collection
+     */
+    void addTrees(TreeCollection &trees);
+
+    /**
+     *  Append all trees of a candidate set
+     */
+    void addTrees(CandidateSet& candidateTrees);
+
+
+    /*
+     *  Get i-th tree and its score
+    */
+    pair<string, double> getTree(int i);
+
+    /**
+     *  Remove all trees
+     */
+    void clear();
+
+    /**
+     *  Replace the tree strings. Taken by const reference so the argument is
+     *  copied once, not twice. Scores and source process IDs are left as they are.
+     */
+    void setTreeStrings(const vector<string> &treeStrings) {
+        TreeCollection::treeStrings = treeStrings;
+    }
+
+    /**
+     *  Replace the scores. Taken by const reference so the argument is copied
+     *  once, not twice. Tree strings and source process IDs are left as they are.
+     */
+    void setScores(const vector<double> &scores) {
+        TreeCollection::scores = scores;
+    }
+
+    /**
+     *  @return number of trees in the collection
+     */
+    size_t getNumTrees();
+
+    const vector<string> &getTreeStrings() const {
+        return treeStrings;
+    }
+
+    const vector<double> &getScores() const {
+        return scores;
+    }
+
+    const vector<int> &getSourceProcID() const {
+        return sourceProcID;
+    }
+
+};
+
+
+#endif //IQTREE_TREECOLLECTION_H
diff --git a/alignment.cpp b/alignment.cpp
index dcab0ff..f935d2b 100644
--- a/alignment.cpp
+++ b/alignment.cpp
@@ -790,6 +790,7 @@ void Alignment::orderPatternByNumChars() {
     }
     delete [] ptn_order;
     delete [] num_chars;
+//    cout << ordered_pattern.size() << " ordered_pattern" << endl;
 }
 
 void Alignment::ungroupSitePattern()
@@ -1882,7 +1883,7 @@ int Alignment::buildRetainingSites(const char *aln_site_list, IntVector &kept_si
     }
     if (exclude_const_sites) {
         for (j = 0; j < kept_sites.size(); j++)
-        	if (at(site_pattern[j]).isConst())
+        	if (at(site_pattern[j]).isInvariant())
         		kept_sites[j] = 0;
 
     }
@@ -1990,10 +1991,12 @@ void Alignment::extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_t
     site_pattern.resize(aln->getNSite(), -1);
     clear();
     pattern_index.clear();
-    int site = 0;
+    int site = 0, removed_sites = 0;
     VerboseMode save_mode = verbose_mode;
     verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
-    for (iterator pit = aln->begin(); pit != aln->end(); pit++) {
+//    for (iterator pit = aln->begin(); pit != aln->end(); pit++) {
+    for (site = 0; site < aln->getNSite(); site++) {
+        iterator pit = aln->begin() + (aln->getPatternID(site)); 
         Pattern pat;
         int true_char = 0;
         for (it = seq_id.begin(); it != seq_id.end(); it++) {
@@ -2001,12 +2004,14 @@ void Alignment::extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_t
             if (ch != STATE_UNKNOWN) true_char++;
             pat.push_back(ch);
         }
-        if (true_char < min_true_char) continue;
-        addPattern(pat, site, (*pit).frequency);
-        for (int i = 0; i < (*pit).frequency; i++)
-            site_pattern[site++] = size()-1;
+        if (true_char < min_true_char)
+            removed_sites++;
+        else
+            addPattern(pat, site-removed_sites);
+//        for (int i = 0; i < (*pit).frequency; i++)
+//            site_pattern[site++] = size()-1;
     }
-    site_pattern.resize(site);
+    site_pattern.resize(aln->getNSite() - removed_sites);
     verbose_mode = save_mode;
     countConstSite();
     buildSeqStates();
diff --git a/alignment.h b/alignment.h
index 781a378..eee6169 100644
--- a/alignment.h
+++ b/alignment.h
@@ -163,7 +163,7 @@ public:
 
     /** order pattern by number of character states and return in ptn_order
     */
-    void orderPatternByNumChars();
+    virtual void orderPatternByNumChars();
 
     /**
      * un-group site-patterns, i.e., making #sites = #patterns and pattern frequency = 1 for all patterns
@@ -579,6 +579,11 @@ public:
      */
     virtual double computeUnconstrainedLogL();
 
+    /**
+     * 	@return number of states, if it is a partition model, return max num_states across all partitions
+     */
+    virtual int getMaxNumStates() { return num_states; }
+
     /** either SEQ_BINARY, SEQ_DNA, SEQ_PROTEIN, SEQ_MORPH, or SEQ_CODON */
     SeqType seq_type;
 
diff --git a/candidateset.cpp b/candidateset.cpp
index fa5ea81..f6a5fc8 100644
--- a/candidateset.cpp
+++ b/candidateset.cpp
@@ -2,30 +2,40 @@
  * candidateset.cpp
  *
  *  Created on: Jun 1, 2014
- *      Author: Tung Nguyen
+ *  Author: Tung Nguyen
+ *  Email: nltung at gmail.com
  */
 
-#include "phylotree.h"
+#include "iqtree.h"
 #include "candidateset.h"
+#include "MPIHelper.h"
 
-void CandidateSet::init(Alignment* aln, Params *params) {
+void CandidateSet::init(Alignment *aln, int maxSize) {
     this->aln = aln;
-    this->params = params;
+    this->maxSize = maxSize;
 }
 
 CandidateSet::~CandidateSet() {
 }
 
 CandidateSet::CandidateSet() : CheckpointFactory() {
-	aln = NULL;
-	params = NULL;
+    aln = NULL;
+    numStableSplits = 0;
+    this->maxSize = Params::getInstance().maxCandidates;
+}
+
+// Replace this set's whole state with a copy of candSet, then hand the
+// previous maxSize back to setMaxSize() so the size limit survives the copy.
+void CandidateSet::initTrees(CandidateSet& candSet) {
+    const int savedMaxSize = maxSize;
+    *this = candSet;
+    setMaxSize(savedMaxSize);
 }
 
 
+
 void CandidateSet::saveCheckpoint() {
     checkpoint->startStruct("CandidateSet");
-	int ntrees = min(params->numNNITrees, (int)size());
-    checkpoint->startList(params->numNNITrees);
+    int ntrees = min(Params::getInstance().numNNITrees, (int) size());
+    checkpoint->startList(Params::getInstance().numNNITrees);
     for (reverse_iterator it = rbegin(); it != rend() && ntrees > 0; it++, ntrees--) {
         checkpoint->addListElement();
         stringstream ss;
@@ -46,8 +56,8 @@ void CandidateSet::restoreCheckpoint() {
     checkpoint->startStruct("CandidateSet");
     double score;
     string tree;
-    checkpoint->startList(params->numNNITrees);
-    for (int i = 0; i < params->numNNITrees; i++) {
+    checkpoint->startList(Params::getInstance().numNNITrees);
+    for (int i = 0; i < Params::getInstance().numNNITrees; i++) {
         checkpoint->addListElement();
         string str;
         if (!checkpoint->getString("", str)) {
@@ -57,68 +67,84 @@ void CandidateSet::restoreCheckpoint() {
         ss >> score >> tree;
 //        CKP_RESTORE(tree);
         update(tree, score);
-        
+
     }
     checkpoint->endList();
     checkpoint->endStruct();
 }
 
 
-vector<string> CandidateSet::getBestTrees() {
-	vector<string> res;
-	double bestScore = rbegin()->first;
-	for (reverse_iterator rit = rbegin(); rit != rend() && rit->second.score == bestScore; rit++) {
-		res.push_back(rit->second.tree);
-	}
-	return res;
-}
-
-string CandidateSet::getRandCandTree() {
-	assert(!empty());
-	if (empty())
-		return "";
-	int id = random_int(min(params->popSize, (int)size()) );
-	for (reverse_iterator i = rbegin(); i != rend(); i++, id--)
-		if (id == 0)
-			return i->second.tree;
-	assert(0);
-	return "";
-}
-
-vector<string> CandidateSet::getTopTrees(int numTree) {
-	assert(numTree <= params->maxCandidates);
-	if (numTree == 0) {
-		numTree = params->maxCandidates;
-	}
-	vector<string> res;
-	int cnt = numTree;
-	for (reverse_iterator rit = rbegin(); rit != rend() && cnt > 0; rit++, cnt--) {
-		res.push_back(rit->second.tree);
-	}
-	return res;
-}
-
-vector<string> CandidateSet::getBestLocalOptimalTrees(int numTree) {
-	assert(numTree <= params->maxCandidates);
-	if (numTree == 0) {
-		numTree = params->maxCandidates;
-	}
-	vector<string> res;
-	int cnt = numTree;
-	for (reverse_iterator rit = rbegin(); rit != rend() && cnt > 0; rit++) {
-		if (rit->second.localOpt) {
-			res.push_back(rit->second.tree);
-			cnt--;
-		}
-	}
-	return res;
+// Return the tree string of a randomly chosen entry among the first
+// min(numTopTrees, size()) entries counted from rbegin().
+// Asserts that the set is not empty; with asserts disabled an empty set yields "".
+string CandidateSet::getRandTopTree(int numTopTrees) {
+    assert(!empty());
+    if (empty())
+        return "";
+    // Number of entries to step over, starting at rbegin().
+    int stepsLeft = random_int(min(numTopTrees, (int) size()));
+    reverse_iterator cand = rbegin();
+    while (cand != rend() && stepsLeft != 0) {
+        ++cand;
+        --stepsLeft;
+    }
+    if (cand != rend())
+        return cand->second.tree;
+    // Only reached when the drawn index does not address an entry of the set.
+    assert(0);
+    return "";
+}
+
+// Collect the tree strings of up to numTree entries, walking from rbegin().
+// A numTree of 0, or one larger than maxSize, is replaced by maxSize.
+vector<string> CandidateSet::getBestTreeStrings(int numTree) {
+    const int limit = (numTree == 0 || numTree > maxSize) ? maxSize : numTree;
+    vector<string> best;
+    int remaining = limit;
+    reverse_iterator cand = rbegin();
+    while (cand != rend() && remaining > 0) {
+        best.push_back(cand->second.tree);
+        ++cand;
+        --remaining;
+    }
+    return best;
 }
+
+// Return this process's share of the best tree strings: the result of
+// getBestTreeStrings() is dealt out round-robin over the processes reported
+// by MPIHelper. With a single process the full list is returned.
+vector<string> CandidateSet::getBestTreeStringsForProcess(int numTree) {
+    int numProc = MPIHelper::getInstance().getNumProcesses();
+    int procID = MPIHelper::getInstance().getProcessID();
+
+    // BUG FIX: make sure that each process gets at least 1 tree
+    if (numTree < numProc)
+        numTree = numProc;
+
+    vector<string> alltrees = getBestTreeStrings(numTree);
+    if (numProc == 1)
+        return alltrees;
+
+    // Never index past the trees that are actually available.
+    if (numTree == 0 || numTree > alltrees.size())
+        numTree = alltrees.size();
+
+    // process will get trees indexed procID, procID+1*numProc, procID+2*numProc,...
+    vector<string> share;
+    int idx = procID;
+    while (idx < numTree) {
+        share.push_back(alltrees[idx]);
+        idx += numProc;
+    }
+    return share;
+}
+
+
+//vector<string> CandidateSet::getBestLocalOptimalTrees(int numTree) {
+//	assert(numTree <= params->maxPopSize);
+//	if (numTree == 0) {
+//		numTree = params->maxPopSize;
+//	}
+//	vector<string> res;
+//	int cnt = numTree;
+//	for (reverse_iterator rit = rbegin(); rit != rend() && cnt > 0; rit++) {
+//		if (rit->second.localOpt) {
+//			res.push_back(rit->second.tree);
+//			cnt--;
+//		}
+//	}
+//	return res;
+//}
+
 /*
 bool CandidateSet::replaceTree(string tree, double score) {
     CandidateTree candidate;
     candidate.tree = tree;
     candidate.score = score;
-    candidate.topology = getTopology(tree);
+    candidate.topology = getTopologyString(tree);
     if (treeTopologyExist(candidate.topology)) {
         topologies[candidate.topology] = score;
         for (reverse_iterator i = rbegin(); i != rend(); i++) {
@@ -133,6 +159,53 @@ bool CandidateSet::replaceTree(string tree, double score) {
     }
     return true;
 }
+*/
+
+
+// Add the splits of one tree to candSplits: a split already recorded has both
+// its weight and its stored count raised by one, a new split is inserted with
+// weight and count 1. The tree counter of candSplits is raised by one.
+void CandidateSet::addCandidateSplits(string treeString) {
+    vector<string> taxaNames = aln->getSeqNames();
+    MTree tree(treeString, taxaNames, Params::getInstance().is_rooted);
+    SplitGraph treeSplits;
+    tree.convertSplits(treeSplits);
+    SplitGraph::iterator cur = treeSplits.begin();
+    while (cur != treeSplits.end()) {
+        int count; // only read when findSplit() returned a split
+        Split *known = candSplits.findSplit(*cur, count);
+        if (known == NULL) {
+            // First occurrence: candSplits receives its own copy of the split.
+            known = new Split(*(*cur));
+            known->setWeight(1);
+            candSplits.insertSplit(known, 1);
+        } else {
+            known->setWeight(count + 1);
+            candSplits.setValue(known, count + 1);
+        }
+        ++cur;
+    }
+    candSplits.setNumTree(candSplits.getNumTree() + 1);
+}
+
+// Remove the splits of one tree from candSplits: a split recorded more than
+// once has its weight and its stored count lowered by one, a split recorded
+// once is erased. The tree counter of candSplits is lowered by one.
+// Prints a message and exits the program if a split of the tree is not recorded.
+void CandidateSet::removeCandidateSplits(string treeString) {
+    vector<string> taxaNames = aln->getSeqNames();
+    MTree tree(treeString, taxaNames, Params::getInstance().is_rooted);
+    SplitGraph allSplits;
+    tree.convertSplits(allSplits);
+    for (SplitGraph::iterator splitIt = allSplits.begin(); splitIt != allSplits.end(); splitIt++) {
+        int value = 0;
+        Split *sp = candSplits.findSplit(*splitIt, value);
+        if (value == 0) {
+            // A count of 0 after the lookup is treated as "split not recorded".
+            cout << "Cannot find split: ";
+            (*splitIt)->report(cout);
+            exit(1);
+        }
+        assert(sp->getWeight() >= 1);
+        if (sp->getWeight() > 1) {
+            sp->setWeight(value - 1);
+            // Keep the stored count in step with the weight, as
+            // addCandidateSplits() does; otherwise the next lookup reads a
+            // stale count and derives a wrong weight from it.
+            candSplits.setValue(sp, value - 1);
+        } else {
+            candSplits.eraseSplit(*splitIt);
+        }
+    }
+    candSplits.setNumTree(candSplits.getNumTree() - 1);
+}
 
 string CandidateSet::getNextCandTree() {
     string tree;
@@ -147,78 +220,84 @@ string CandidateSet::getNextCandTree() {
 
 void CandidateSet::initParentTrees() {
     if (parentTrees.empty()) {
-        int count = params->popSize;
-        for (reverse_iterator i = rbegin(); i != rend() && count >0 ; i++, count--) {
+        int count = Params::getInstance().popSize;
+        for (reverse_iterator i = rbegin(); i != rend() && count > 0; i++, count--) {
             parentTrees.push(i->second.tree);
             //cout << i->first << endl;
         }
     }
 }
-*/
-bool CandidateSet::update(string tree, double score, bool localOpt) {
-	bool newTree = true;
-	CandidateTree candidate;
-	candidate.score = score;
-	candidate.topology = getTopology(tree);
-	candidate.localOpt = localOpt;
-//	cout << "Updating candidate tree " << tree << endl;
-	candidate.tree = tree;
-
-	if (treeTopologyExist(candidate.topology)) {
-		newTree = false;
-	    /* If tree topology already exist but the score is better, we replace the old one
-	    by the new one (with new branch lengths) and update the score */
-		if (topologies[candidate.topology] < score) {
-			removeCandidateTree(candidate.topology);
-			topologies[candidate.topology] = score;
-			// insert tree into candidate set
-			insert(CandidateSet::value_type(score, candidate));
-		} else if (candidate.localOpt) {
-			CandidateSet::iterator treePtr = getCandidateTree(candidate.topology);
-			treePtr->second.localOpt = candidate.localOpt;
-		}
-	} else {
-		if (getWorstScore() < score && size() >= params->maxCandidates) {
-			// remove the worst-scoring tree
-			topologies.erase(begin()->second.topology);
-			erase(begin());
-		}
-		CandidateSet::iterator it = insert(CandidateSet::value_type(score, candidate));
-		topologies[candidate.topology] = score;
-		if (params->fix_stable_splits && getNumLocalOptTrees() >= params->numSupportTrees) {
-			int it_pos = distance(it, end());
-			// The new tree is one of the numSupportTrees best trees.
-			// Thus recompute supported splits
-			if (it_pos <= params->numSupportTrees) {
-				int nSupportedSplits = computeSplitSupport(params->numSupportTrees);
-				cout << ((double) nSupportedSplits / (aln->getNSeq() - 3)) * 100
-						<< " % of the splits have 100% support and can be fixed." << endl;
-			}
-		}
-	}
-	assert(topologies.size() == size());
-	return newTree;
+
+
+/**
+ *  Offer a tree to the candidate set.
+ *  @param newTree tree string
+ *  @param newScore score of the tree; the set is keyed by score, begin() being the lowest
+ *  @return -2 if the set is full and newScore is below its lowest score (nothing changes),
+ *          -1 if the topology is already present (its entry is replaced only when
+ *          newScore is higher than the stored score),
+ *          otherwise distance(inserted entry, end()), i.e. 1 when the new tree sorts last
+ */
+int CandidateSet::update(string newTree, double newScore) {
+    // Do not update candidate set if the new tree has worse score than the
+    // worst tree in the candidate set. The emptiness and size tests come first
+    // so that begin() is never dereferenced on an empty set.
+    if (!empty() && size() >= maxSize && newScore < begin()->first) {
+        return -2;
+    }
+    CandidateTree candidate;
+    candidate.score = newScore;
+    candidate.topology = convertTreeString(newTree);
+    candidate.tree = newTree;
+
+    int treePos;
+    CandidateSet::iterator candidateTreeIt;
+
+    if (treeTopologyExist(candidate.topology)) {
+        // update new score if it is better the old score
+        double oldScore = topologies[candidate.topology];
+        if (oldScore < newScore) {
+            removeCandidateTree(candidate.topology);
+            insert(CandidateSet::value_type(newScore, candidate));
+            topologies[candidate.topology] = newScore;
+        }
+        assert(topologies.size() == size());
+        return -1;
+    }
+
+    candidateTreeIt = insert(CandidateSet::value_type(newScore, candidate));
+    topologies[candidate.topology] = newScore;
+
+    // Keep the set within its limit by dropping the lowest-scoring entry.
+    if (size() > maxSize) {
+        removeWorstTree();
+    }
+    assert(topologies.size() == size());
+
+    // Position of the new entry counted from the end of the score-ordered set.
+    treePos = distance(candidateTreeIt, end());
+
+    return treePos;
 }
 
 vector<double> CandidateSet::getBestScores(int numBestScore) {
-	if (numBestScore == 0)
-		numBestScore = size();
-	vector<double> res;
-	for (reverse_iterator rit = rbegin(); rit != rend() && numBestScore > 0; rit++, numBestScore--) {
-		res.push_back(rit->first);
-	}
-	return res;
+    if (numBestScore == 0)
+        numBestScore = size();
+    vector<double> res;
+    for (reverse_iterator rit = rbegin(); rit != rend() && numBestScore > 0; rit++, numBestScore--) {
+        res.push_back(rit->first);
+    }
+    return res;
 }
 
 double CandidateSet::getBestScore() {
-	if (size() == 0)
-		return -DBL_MAX;
-	else
-		return rbegin()->first;
+    if (size() == 0)
+        return -DBL_MAX;
+    else
+        return rbegin()->first;
 }
 
-double CandidateSet::getWorstScore() {
-	return begin()->first;
+// Parse treeString into an MTree, assign leaf IDs, re-root the tree at the
+// leaf named "0" and return it printed with the given format flags.
+string CandidateSet::convertTreeString(string treeString, int format) {
+    stringstream treeInput(treeString); // reading starts at the first character
+    MTree parsed;
+    parsed.readTree(treeInput, Params::getInstance().is_rooted);
+    parsed.assignLeafID();
+
+    string rootLeaf = "0";
+    parsed.root = parsed.findLeafName(rootLeaf);
+
+    ostringstream printed;
+    parsed.printTree(printed, format);
+    return printed.str();
 }
 
 string CandidateSet::getTopology(string tree) {
@@ -227,119 +306,220 @@ string CandidateSet::getTopology(string tree) {
 //	mtree.aln = this->aln;
 //	mtree.setParams(params);
     MTree mtree;
-    
-	stringstream str;
-	str << tree;
-	str.seekg(0, ios::beg);
+
+    stringstream str;
+    str << tree;
+    str.seekg(0, ios::beg);
 //	freeNode();
-	mtree.readTree(str, params->is_rooted);
+    mtree.readTree(str, Params::getInstance().is_rooted);
 //	mtree.setAlignment(aln);
 //	mtree.setRootNode(params->root);
     mtree.assignLeafID();
     string x = "0";
     mtree.root = mtree.findLeafName(x);
 
-//	mtree.readTreeString(tree);
-//	mtree.setRootNode(params->root);
-
-	ostringstream ostr;
-	mtree.printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA);
-	return ostr.str();
+    ostringstream ostr;
+    mtree.printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA);
+    return ostr.str();
 }
 
 double CandidateSet::getTopologyScore(string topology) {
-	assert(topologies.find(topology) != topologies.end());
-	return topologies[topology];
+    assert(topologies.find(topology) != topologies.end());
+    return topologies[topology];
 }
 
 void CandidateSet::clear() {
-	multimap<double, CandidateTree>::clear();
-	clearTopologies();
+    multimap<double, CandidateTree>::clear();
+    clearTopologies();
 }
 
 void CandidateSet::clearTopologies() {
-	topologies.clear();
+    topologies.clear();
 }
 
 
 CandidateSet CandidateSet::getBestCandidateTrees(int numTrees) {
-	CandidateSet res;
-	if (numTrees >= size())
-		numTrees = size();
-	for (reverse_iterator rit = rbegin(); rit != rend() && numTrees > 0; rit++, numTrees--) {
-		res.insert(*rit);
-	}
-	return res;
+    CandidateSet res;
+    if (numTrees >= size() || numTrees == 0)
+        numTrees = (int) size();
+
+    for (reverse_iterator rit = rbegin(); rit != rend() && numTrees > 0; rit++, numTrees--) {
+        res.insert(*rit);
+    }
+    return res;
+}
+
+// Fill trees and scores with every entry of the set, walking from rbegin()
+// to rend(). A format of -1 copies the stored tree strings unchanged; any
+// other value passes each string through convertTreeString() with that format.
+void CandidateSet::getAllTrees(vector<string> &trees, vector<double> &scores, int format) {
+    scores.clear();
+    trees.clear();
+
+    const bool keepStored = (format == -1);
+    for (reverse_iterator cand = rbegin(); cand != rend(); ++cand) {
+        const string &stored = cand->second.tree;
+        if (keepStored)
+            trees.push_back(stored);
+        else
+            trees.push_back(convertTreeString(stored, format));
+        scores.push_back(cand->first);
+    }
 }
 
 bool CandidateSet::treeTopologyExist(string topo) {
-	return (topologies.find(topo) != topologies.end());
+    return (topologies.find(topo) != topologies.end());
 }
 
 bool CandidateSet::treeExist(string tree) {
-	return treeTopologyExist(getTopology(tree));
+    return treeTopologyExist(convertTreeString(tree));
 }
 
 CandidateSet::iterator CandidateSet::getCandidateTree(string topology) {
-	for (CandidateSet::reverse_iterator rit = rbegin(); rit != rend(); rit++) {
-		if (rit->second.topology == topology)
-			return --(rit.base());
-	}
-	return end();
+    for (CandidateSet::reverse_iterator rit = rbegin(); rit != rend(); rit++) {
+        if (rit->second.topology == topology)
+            return --(rit.base());
+    }
+    return end();
 }
 
 void CandidateSet::removeCandidateTree(string topology) {
-	bool removed = false;
-	for (CandidateSet::reverse_iterator rit = rbegin(); rit != rend(); rit++) {
-			if (rit->second.topology == topology) {
-				erase( --(rit.base()) );
-				topologies.erase(topology);
-				removed = true;
-				break;
-			}
-	}
-	assert(removed);
-}
-
-bool CandidateSet::isStableSplit(Split& sp) {
-	return stableSplit.containSplit(sp);
-}
-
-int CandidateSet::computeSplitSupport(int numTree) {
-	stableSplit.clear();
-	if (numTree == 0)
-		numTree = getNumLocalOptTrees();
-	SplitIntMap hash_ss;
-	SplitGraph sg;
-	MTreeSet boot_trees;
-	int numMaxSupport = 0;
-	vector<string> trees = getBestLocalOptimalTrees(numTree);
-	assert(trees.size() > 1);
-	int maxSupport = trees.size();
-	boot_trees.init(trees, aln->getSeqNames(), params->is_rooted);
-	boot_trees.convertSplits(aln->getSeqNames(), sg, hash_ss, SW_COUNT, -1, NULL, false);
-
-	for (SplitIntMap::iterator it = hash_ss.begin(); it != hash_ss.end(); it++) {
-		if (it->second == maxSupport && it->first->countTaxa() > 1) {
-			numMaxSupport++;
-			Split* supportedSplit = new Split(*(it->first));
-			stableSplit.push_back(supportedSplit);
-		}
-	}
-	//cout << "Number of supported splits = " << numMaxSupport << endl;
-	return numMaxSupport;
-}
-
-void CandidateSet::setAln(Alignment* aln) {
-	this->aln = aln;
-}
-
-int CandidateSet::getNumLocalOptTrees() {
-	int numLocalOptima = 0;
-	for (reverse_iterator rit = rbegin(); rit != rend(); rit++) {
-		if (rit->second.localOpt) {
-			numLocalOptima++;
-		}
-	}
-	return numLocalOptima;
+    bool removed = false;
+    double treeScore;
+    // Find the score of the topology
+    treeScore = topologies[topology];
+    // Remove the topology
+    topologies.erase(topology);
+    pair<CandidateSet::iterator, CandidateSet::iterator> treeItPair;
+    // Find all trees with that score
+    treeItPair = equal_range(treeScore);
+    CandidateSet::iterator it;
+    for (it = treeItPair.first; it != treeItPair.second; ++it) {
+        if (it->second.topology == topology) {
+            erase(it);
+            removed = true;
+            break;
+        }
+    }
+    assert(removed);
+}
+
+
+// Drop the first (lowest-keyed) entry of the set together with its record in
+// topologies. The set must not be empty.
+void CandidateSet::removeWorstTree() {
+    CandidateSet::iterator worst = begin();
+    topologies.erase(worst->second.topology);
+    erase(worst);
+}
+
+// Rebuild candSplits from every tree currently in the set. For each split the
+// stored count is the number of trees containing it and the weight is that
+// count divided by the number of trees. With verbose_mode >= VB_MED the share
+// of stable splits is printed.
+// NOTE(review): the value returned is the member numStableSplits, not the
+// newNumStableSplits computed below; nothing visible in this function assigns
+// the member -- confirm this is intended.
+int CandidateSet::computeSplitOccurences(double supportThreshold) {
+    candSplits.clear();
+    candSplits.setNumTree(size());
+
+    /* Store all splits in the best trees in candSplits.
+     * The variable numTree in SplitIntMap is the number of trees, from which the splits are converted.
+     */
+    CandidateSet::iterator treeIt;
+    //vector<string> taxaNames = aln->getSeqNames();
+    for (treeIt = begin(); treeIt != end(); treeIt++) {
+        MTree tree(treeIt->second.tree, Params::getInstance().is_rooted);
+        SplitGraph splits;
+        tree.convertSplits(splits);
+        SplitGraph::iterator itg;
+        for (itg = splits.begin(); itg != splits.end(); itg++) {
+            int value;
+            Split *sp = candSplits.findSplit(*itg, value);
+            if (sp != NULL) {
+                // Split seen before: raise its count and recompute its support fraction.
+                int newHashWeight = value + 1;
+                double newSupport = (double) newHashWeight / (double) candSplits.getNumTree();
+                sp->setWeight(newSupport);
+                candSplits.setValue(sp, newHashWeight);
+            }
+            else {
+                // First occurrence: insert a copy with count 1 and support 1/numTree.
+                sp = new Split(*(*itg));
+                sp->setWeight(1.0 / (double) candSplits.getNumTree());
+                candSplits.insertSplit(sp, 1);
+            }
+        }
+    }
+    int newNumStableSplits = countStableSplits(supportThreshold);
+    if (verbose_mode >= VB_MED) {
+        // Percentage relative to (number of sequences - 3).
+        cout << ((double) newNumStableSplits / (aln->getNSeq() - 3)) * 100;
+        cout << " % of the splits are stable (support threshold " << supportThreshold;
+        cout << " from " << candSplits.getNumTree() << " trees)" << endl;
+    }
+
+    return numStableSplits;
+}
+
+int CandidateSet::countStableSplits(double thresHold) {
+    // thresholds of 1.0 or more are lowered to 0.99 before comparing
+    const double minSupport = (thresHold >= 1.0) ? 0.99 : thresHold;
+    if (candSplits.empty()) return 0;
+    int stableCount = 0;
+    SplitIntMap::iterator splitIt = candSplits.begin();
+    while (splitIt != candSplits.end()) {
+        // a split counts when its weight reaches the threshold and countTaxa() exceeds 1
+        bool stable = splitIt->first->getWeight() >= minSupport && splitIt->first->countTaxa() > 1;
+        stableCount += stable ? 1 : 0;
+        splitIt++;
+    }
+    return stableCount;
+}
+
+void CandidateSet::reportStableSplits() {
+    if (candSplits.empty()) {
+        cout << "The set of stable splits is empty! " << endl;
+        return;
+    }
+
+    // Report every split whose occurrence count (map value) equals candSplits.getNumTree().
+    for (SplitIntMap::iterator it = candSplits.begin(); it != candSplits.end(); it++) {
+        if (it->second == candSplits.getNumTree() && it->first->countTaxa() > 1) {
+            cout << it->second << " / " << candSplits.getNumTree() << endl;
+            // no assert on getWeight(): computeSplitOccurences() stores a fraction there, not a count
+            it->first->report(cout);
+        }
+    }
+}
+
+void CandidateSet::setAln(Alignment *aln) {
+    this->aln = aln;
 }
+
+CandidateSet CandidateSet::getCandidateTrees(double score) {
+    CandidateSet res;
+    for (CandidateSet::iterator it = begin(); it != end(); it++) {
+        // |it->first - score| < 0.1, written out so an integer abs() overload cannot truncate it
+        if (it->first - score < 0.1 && score - it->first < 0.1)
+            res.insert(*it);
+    }
+    return res;
+}
+
+void CandidateSet::printTrees(string suffix) {
+    ofstream outTrees, outLHs;
+    string outTreesFile = string(Params::getInstance().out_prefix) + "." + suffix;
+    string outLHsFile = string(Params::getInstance().out_prefix) + "." + suffix + "_lh";
+    outTrees.open(outTreesFile.c_str());
+    outLHs.open(outLHsFile.c_str());
+    outLHs.precision(15);
+    for (reverse_iterator rit = rbegin(); rit != rend(); rit++) {
+        outLHs << rit->first << endl;
+        outTrees << rit->second.topology << endl;
+    }
+    outTrees.close();
+    outLHs.close();
+}
+
+void CandidateSet::recomputeLoglOfAllTrees(IQTree &treeObject) {
+    vector<string> allTreeStrings = getBestTreeStrings();
+    for (vector<string>:: iterator it = allTreeStrings.begin(); it != allTreeStrings.end(); it++) {
+        treeObject.readTreeString(*it);
+        double score = treeObject.optimizeAllBranches(1);
+        update(treeObject.getTreeString(), score);
+    }
+}
+
+
+
+
+
+
+
+
+
diff --git a/candidateset.h b/candidateset.h
index 48e9784..a7f3279 100644
--- a/candidateset.h
+++ b/candidateset.h
@@ -1,18 +1,37 @@
-/*
- * candidateset.h
- *
- *  Created on: Jun 1, 2014
- *      Author: Tung Nguyen
- */
+/***************************************************************************
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
 
 #ifndef CANDIDATESET_H_
 #define CANDIDATESET_H_
+//#include "phylotree.h"
 #include "tools.h"
 #include "alignment.h"
 #include "mtreeset.h"
 #include <stack>
 #include "checkpoint.h"
 
+
+class IQTree;
+
 struct CandidateTree {
 
 	/**
@@ -21,7 +40,6 @@ struct CandidateTree {
 	 */
 	string tree;
 
-
 	/**
 	 * tree topology WITHOUT branch lengths
 	 * and WITH TAXON ID (instead of taxon names)
@@ -33,15 +51,6 @@ struct CandidateTree {
 	 * log-likelihood or parsimony score
 	 */
 	double score;
-
-	/**
-	 *  Indicate whether the tree is NNI locally optimal.
-	 *  The reason to have this variable is that if the -reduction is
-	 *  enabled, we will also store non-locally optimal trees in the set.
-	 *  This is done to identify trees that belong to the same basin of attraction
-	 */
-	bool localOpt;
-
 };
 
 
@@ -51,12 +60,21 @@ struct CandidateTree {
 class CandidateSet : public multimap<double, CandidateTree>, public CheckpointFactory {
 
 public:
+
     /**
      * Initialization
      */
-	void init(Alignment* aln, Params *params);
+	void init(Alignment* aln, int maxSize);
+
+    CandidateSet();
 
-	CandidateSet();
+	CandidateSet(int maxSize);
+
+    /**
+     *  Replace the current candidate trees by those in another candidate set
+     *  @param candSet the candidate set whose trees will be copied over
+     */
+    void initTrees(CandidateSet& candSet);
 
     /**
         save object into the checkpoint
@@ -69,9 +87,10 @@ public:
     virtual void restoreCheckpoint();
 
     /**
-     * return randomly one candidate tree from max_candidate
+     * return randomly one of the current best trees
+     * @param numTopTrees [IN] Number of current best trees, from which a random tree is chosen.
      */
-    string getRandCandTree();
+    string getRandTopTree(int numTopTrees);
 
     /**
      * return the next parent tree for reproduction.
@@ -79,7 +98,7 @@ public:
      * been used for reproduction. If all candidate trees have been used, we select the
      * current best trees as the new parent trees
      */
-//    string getNextCandTree();
+    string getNextCandTree();
 
     /**
      *  Replace an existing tree in the candidate set
@@ -92,21 +111,21 @@ public:
     /**
      *  create the parent tree set containing top trees
      */
-//    void initParentTrees();
+    void initParentTrees();
 
     /**
-     * update/insert \a tree into the candidate set if its score is higher than the worst tree
-     *
-     * @param tree
-     * 	The new tree string (with branch lengths)
-     * @param score
-     * 	The score (ML or parsimony) of \a tree
-     * @param localOpt
-     * 	Tells whether \a tree is a locally optimal (DEFAULT: true)
-     * @return false if tree topology already exists
+     *  update/insert \a tree into the candidate set if its score is higher than the worst tree
      *
+     *  @param tree
+     * 	    The new tree string (with branch lengths)
+     *  @param score
+     * 	    The score (ML or parsimony) of \a tree
+     *  @return
+     *      Relative position of the new tree to the current best tree.
+     *      Return -1 if the tree topology already existed
+     *      Return -2 if the candidate set is not updated
      */
-    bool update(string tree, double score, bool localOpt = true);
+    int update(string newTree, double newScore);
 
     /**
      *  Get the \a numBestScores best scores in the candidate set
@@ -119,13 +138,6 @@ public:
     vector<double> getBestScores(int numBestScores = 0);
 
     /**
-     * Get the worst score
-     *
-     * @return the worst score
-     */
-    double getWorstScore();
-
-    /**
      * Get best score
      *
      * @return the best score
@@ -140,24 +152,26 @@ public:
      *  @return
      *  	Vector of current best trees
      */
-    vector<string> getTopTrees(int numTree = 0);
+    vector<string> getBestTreeStrings(int numTree = 0);
 
     /**
-     * 	Get \a numTree best locally optimal trees
-     * 	@param numTree
-     * 		Number of locally optimal trees
-     * 	@return
-     * 		Vector of current best locally optimal trees
+     *  Get \a numTree top scoring trees for this MPI process. Also works for the sequential version.
+     *
+     *  @param numTree
+     *  	Number of top scoring trees
+     *  @return
+     *  	Vector of current best trees
      */
-    vector<string> getBestLocalOptimalTrees(int numTree = 0);
+    vector<string> getBestTreeStringsForProcess(int totalNumTree);
 
     /**
-     * 	Get tree(s) with the best score. There could be more than one
-     * 	tree that share the best score (this happens frequently with parsimony)
-     * 	@return
-     * 		A vector containing trees with the best score
+     *  Return a set of trees and a set of scores
+     *
+     *  @param trees vector of trees
+     *  @param scores vector of tree scores
+     *  @param treeFormat the NEWICK format used for tree string (WT_TAXON_ID, WT_BR_LEN, ..)
      */
-    vector<string> getBestTrees();
+    void getAllTrees(vector<string> &trees, vector<double> &scores, int treeFormat = -1);
 
     /**
      * destructor
@@ -186,12 +200,25 @@ public:
      *
      * 	@param tree
      * 		The newick tree string, from which the topology string will be generated
+     * 	@param convertOption
+     * 	    Use the same options as printTree() (WT_ID, WT_BR_LEN, ...)
      * 	@return
      * 		Newick string of the tree topology
      */
-    string getTopology(string tree);
+    string convertTreeString(const string tree, int format = WT_TAXON_ID | WT_SORT_TAXA);
 
     /**
+     * 	Return a unique topology (sorted by taxon names, rooted at taxon with alphabetically smallest name)
+     * 	without branch lengths
+     *
+     * 	@param tree
+     * 		The newick tree string, from which the topology string will be generated
+     * 	@return
+     * 		Newick string of the tree topology
+     */
+    string getTopology(string tree);
+    
+    /**
      * return the score of \a topology
      *
      * @param topology
@@ -212,18 +239,36 @@ public:
     void clearTopologies();
 
     /**
-     * Compute the split support from the \a numTree best local optimal trees in the candidate sets
-     * @param numTree the number of best trees used to calculate support values
-     * @return number of splits with 100% support value
+     *  Collect all splits from the set of current best trees and compute for each of them the number of occurrences.
+     *
+     *  @param supportThres
+     *      a number in (0,1] representing the support value threshold for stable splits
+     *  @return number of splits with 100% support value
      */
-    int computeSplitSupport(int numTree = 0);
+    int computeSplitOccurences(double supportThres);
+
+   /**
+    *   Get number of stable splits
+    *   @param thresHold A number in (0,1.0]; all splits whose support values reach this threshold
+    *   are considered stable
+    */
+    int countStableSplits(double thresHold);
+
+    void reportStableSplits();
 
     /**
-     * Check whether the
-     * @param sp the split to check, must have the same taxon set as the trees in CandidateSet.
-     * @return true if the \a supportedSplits contain \a sp, false otherwise.
+     *  Update the set of stable split when a new tree is inserted
+     *  to the set of best trees used for computing stable splits.
+     *
+     *  This function will remove all splits that belong to oldTree and add all
+     *  splits of newTree
+     *
+     *  @param
+     *  	oldTree tree that will be replaced by \a newTree
+     *  @param
+     *  	newTree the new tree
      */
-    bool isStableSplit(Split& sp);
+    void updateStableSplit(string oldTree, string newTree);
 
     /**
      * Return a pointer to the \a CandidateTree that has topology equal to \a topology
@@ -233,50 +278,104 @@ public:
     iterator getCandidateTree(string topology);
 
     /**
-     * Remove the \a CandidateTree with topology equal to \a topology
+     * Remove candidate trees with topology equal to the specified topology
      * @param topology
      */
     void removeCandidateTree(string topology);
 
+    /**
+     *  Remove the worst tree in the candidate set
+     */
+    void removeWorstTree();
+
     /* Getter and Setter function */
 	void setAln(Alignment* aln);
-	int getMaxCandidates() const;
-	void setMaxCandidates(int maxCandidates);
-	int getPopSize() const;
-	void setPopSize(int popSize);
-	void setIsRooted(bool isRooted);
+
 	const StringDoubleHashMap& getTopologies() const {
 		return topologies;
 	}
 
-	/**
-	 * get number of locally optimal trees in the set
-	 * @return
-	 */
-	int getNumLocalOptTrees();
-
     /**
-     * Return a CandidateSet containing \a numTrees of current best candidate trees
+     * Return a CandidateSet containing \a numTrees candidate trees
      * @param numTrees
      * @return
      */
-    CandidateSet getBestCandidateTrees(int numTrees);
+    CandidateSet getBestCandidateTrees(int numTrees = 0);
 
-	SplitGraph& getStableSplits() {
-		return stableSplit;
+    /**
+     *  Return the set of trees whose scores differ from \a score by less than 0.1
+     */
+    CandidateSet getCandidateTrees(double score);
+
+
+	SplitIntMap& getCandSplits() {
+		return candSplits;
 	}
 
+	/**
+	 * @brief Get a random subset containing \a numSplit from the
+	 * set of stable splits.
+	 * @param
+	 * 		numSplit size of the subset
+	 * @param
+	 * 		splits (OUT) a random subset of the stable splits
+	 */
+	//void getRandomStableSplits(int numSplit, SplitGraph& splits);
+
+	/**
+	 *  Add splits from \a treeString to the current candidate splits
+	 *
+	 *  @param tree collect splits from this tree
+	 */
+	void addCandidateSplits(string treeString);
+
+	/**
+	 *  Remove splits that appear from \a treeString.
+	 *  If an existing split has weight > 1, their weight will be
+	 *  reduced by 1.
+	 */
+	void removeCandidateSplits(string treeString);
+
+    int getNumStableSplits() const {
+        return numStableSplits;
+    }
+
+    /**
+     *  Print candidate trees and their likelihood
+     */
+    void printTrees(string suffix);
+
+    /**
+     *  Recompute the log-likelihood of all trees
+     *  @param treeObject the tree object which store other model parameters used
+     *  to compute the log-likelihood.
+     */
+    void recomputeLoglOfAllTrees(IQTree &treeObject);
+
+    int getMaxSize() const {
+        return maxSize;
+    }
+
+    void setMaxSize(int maxSize) {
+        this->maxSize = maxSize;
+    }
+
 private:
+    /**
+     *  Maximum number of candidate trees
+     */
+    int maxSize;
 
     /**
-     *  Set of supported splits by the best trees
+     *  Number of stable splits identified
      */
-    SplitGraph stableSplit;
+    int numStableSplits;
 
     /**
-     *  Shared params pointing to the global params
+     *  Set of splits and the number of their occurrences in the current best trees.
+     *  The number of current best tree is parameterized.
      */
-    Params* params;
+	SplitIntMap candSplits;
 
     /**
      *  Map data structure storing <topology_string, score>
@@ -292,7 +391,6 @@ private:
      * pointer to alignment, just to assign correct IDs for taxa
      */
     Alignment *aln;
-
 };
 
 #endif /* CANDIDATESET_H_ */
diff --git a/checkpoint.cpp b/checkpoint.cpp
index 8517ef2..1b26991 100644
--- a/checkpoint.cpp
+++ b/checkpoint.cpp
@@ -30,6 +30,47 @@ Checkpoint::~Checkpoint() {
 void Checkpoint::setFileName(string filename) {
 	this->filename = filename;
 }
+
+// Read checkpoint entries line by line from \a in and store them in this map.
+void Checkpoint::load(istream &in) {
+    string line;
+    string struct_name;
+    size_t pos;
+    int listid = 0;
+    while (!in.eof()) {
+        getline(in, line);
+        pos = line.find('#'); // everything from the first '#' onwards is discarded
+        if (pos != string::npos)
+            line.erase(pos);
+        line.erase(line.find_last_not_of("\n\r\t")+1);
+        // only trailing line breaks and tabs are stripped above, not spaces
+        if (line.empty()) continue;
+        if (line[0] != ' ') { // a line without a leading space ends the current struct
+            struct_name = "";
+        }
+        // the indentation has been inspected, so leading whitespace can be removed now
+        line.erase(0, line.find_first_not_of(" \n\r\t"));
+        if (line.empty()) continue;
+        pos = line.find(": ");
+        if (pos != string::npos) {
+            // "key: value" mapping, stored under the current struct prefix
+            (*this)[struct_name + line.substr(0, pos)] = line.substr(pos+2);
+        } else if (line[line.length()-1] == ':') {
+            // "name:" starts a new struct; following keys get the prefix "name."
+            line.erase(line.length()-1);
+            trimString(line);
+            struct_name = line + '.';
+            listid = 0;
+            continue;
+        } else {
+            // bare value: collection item keyed by its running index within the struct
+            (*this)[struct_name + convertIntToString(listid)] = line;
+            listid++;
+        }
+    }
+}
+
+
 void Checkpoint::load() {
 	assert(filename != "");
     if (!fileExists(filename)) return;
@@ -47,41 +88,8 @@ void Checkpoint::load() {
         }
         if (line != header)
         	throw ("Invalid checkpoint file " + filename);
-        string struct_name;
-        size_t pos;
-        int listid = 0;
-        while (!in.eof()) {
-        	getline(in, line);
-            pos = line.find('#');
-            if (pos != string::npos)
-                line.erase(pos);
-            line.erase(line.find_last_not_of("\n\r\t")+1);
-//            trimString(line);
-            if (line.empty()) continue;
-            if (line[0] != ' ') {
-                struct_name = "";
-            }
-//            trimString(line);
-            line.erase(0, line.find_first_not_of(" \n\r\t"));
-            if (line.empty()) continue;
-        	pos = line.find(": ");
-        	if (pos != string::npos) {
-                // mapping
-                (*this)[struct_name + line.substr(0, pos)] = line.substr(pos+2);
-            } else if (line[line.length()-1] == ':') {
-                // start a new struct
-                line.erase(line.length()-1);
-                trimString(line);
-                struct_name = line + '.';
-                listid = 0;
-                continue;
-            } else {
-                // collection
-                (*this)[struct_name + convertIntToString(listid)] = line;
-                listid++;
-//        		throw "':' is expected between key and value";
-            }
-        }
+        // call load from the stream
+        load(in);
         in.clear();
         // set the failbit again
         in.exceptions(ios::failbit | ios::badbit);
@@ -111,6 +119,23 @@ void Checkpoint::setDumpInterval(double interval) {
     dump_interval = interval;
 }
 
+void Checkpoint::dump(ostream &out) {
+    // name of the struct whose "name:" header line was written most recently
+    string struct_name;
+    size_t pos;
+    for (iterator i = begin(); i != end(); i++) {
+        if ((pos = i->first.find('.')) != string::npos) {
+            // key is "struct.member": write the struct header once, then the member
+            if (struct_name != i->first.substr(0, pos)) {
+                struct_name = i->first.substr(0, pos);
+                out << struct_name << ':' << endl;
+            }
+            // members are indented by one space; load() uses the leading space to detect them
+            out << ' ' << i->first.substr(pos+1) << ": " << i->second << endl;
+        } else
+            out << i->first << ": " << i->second << endl;
+    }
+}
 
 void Checkpoint::dump(bool force) {
     if (filename == "")
@@ -128,25 +153,13 @@ void Checkpoint::dump(bool force) {
             out = new ofstream(filename.c_str()); 
         out->exceptions(ios::failbit | ios::badbit);
         *out << header << endl;
-        string struct_name;
-        size_t pos;
-        int listid = 0;
-        for (iterator i = begin(); i != end(); i++) {
-            if ((pos = i->first.find('.')) != string::npos) {
-                if (struct_name != i->first.substr(0, pos)) {
-                    struct_name = i->first.substr(0, pos);
-                    *out << struct_name << ":" << endl;
-                    listid = 0;
-                }
-                // check if key is a collection
-                *out << "  " << i->first.substr(pos+1) << ": " << i->second << endl;
-            } else
-                *out << i->first << ": " << i->second << endl;
-        }
+        // call dump stream
+        dump(*out);
         if (compression)
             ((ogzstream*)out)->close();
         else
             ((ofstream*)out)->close();
+        delete out;
 //        cout << "Checkpoint dumped" << endl;
     } catch (ios::failure &) {
         outError(ERR_WRITE_OUTPUT, filename.c_str());
@@ -218,6 +231,13 @@ void Checkpoint::startList(int nelem) {
         list_element_precision.push_back(0);
 }
 
+void Checkpoint::setListElement(int id) {
+    list_element.back() = id;
+    stringstream ss;
+    ss << setw(list_element_precision.back()) << setfill('0') << list_element.back();
+    struct_name += ss.str() + ".";
+}
+
 void Checkpoint::addListElement() {
     list_element.back()++;
     if (list_element.back() > 0) {
diff --git a/checkpoint.h b/checkpoint.h
index a708d36..d7c478a 100644
--- a/checkpoint.h
+++ b/checkpoint.h
@@ -77,11 +77,23 @@ public:
     void setHeader(string header);
 
 	/**
+	 * load checkpoint information from an input stream
+     * @param in input stream
+	 */
+	void load(istream &in);
+
+	/**
 	 * load checkpoint information from file
 	 */
 	void load();
 
 	/**
+	 * dump checkpoint information into an output stream
+     * @param out output stream
+	 */
+	void dump(ostream &out);
+
+	/**
 	 * dump checkpoint information into file
 	 * @param force TRUE to dump no matter if time interval exceeded or not
 	 */
@@ -308,7 +320,13 @@ public:
         @param nelem number of elements
     */
     void startList(int nelem);
-    
+
+    /**
+        set the starting list element, should only be called right after startList
+        @param id element ID
+    */
+    void setListElement(int id);
+
     /** 
         add an element to the current list
     */
diff --git a/constrainttree.cpp b/constrainttree.cpp
new file mode 100644
index 0000000..a0f8076
--- /dev/null
+++ b/constrainttree.cpp
@@ -0,0 +1,211 @@
+//
+// C++ Implementation: constrainttree.cpp
+//
+// Description: ConstraintTree class used to guide tree search
+//
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+
+#include "phylotree.h"
+#include "constrainttree.h"
+#include "splitgraph.h"
+
+ConstraintTree::ConstraintTree() : MTree(), SplitIntMap() {
+}
+
+void ConstraintTree::initConstraint(const char *constraint_file, StrVector &fulltaxname) {
+    bool is_rooted = false;
+    MTree::init(constraint_file, is_rooted);
+    if (leafNum <= 3)
+        outError("Constraint tree must contain at least 4 taxa");
+    if (is_rooted)
+        outError("Rooted constraint tree not accepted");
+
+	// collapse any internal node of degree 2
+	NodeVector nodes;
+	getInternalNodes(nodes);
+	int num_collapsed = 0;
+	for (NodeVector::iterator it = nodes.begin(); it != nodes.end(); it++)
+		if ((*it)->degree() == 2) {
+			Node *left = (*it)->neighbors[0]->node;
+			Node *right = (*it)->neighbors[1]->node;
+			double len = (*it)->neighbors[0]->length+(*it)->neighbors[1]->length;
+			left->updateNeighbor((*it), right, len);
+			right->updateNeighbor((*it), left, len);
+			delete (*it);
+			num_collapsed++;
+			if (verbose_mode >= VB_MED)
+				cout << "Node of degree 2 collapsed" << endl;
+		}
+	if (num_collapsed)
+		initializeTree();
+    
+    // build taxon name to ID index
+    StrVector taxname;
+    StrVector::iterator it;
+    getTaxaName(taxname);
+    taxname_index.clear();
+    for (it = taxname.begin(); it != taxname.end(); it++)
+        taxname_index[(*it)] = it - taxname.begin();
+
+    // convert into split system
+    SplitGraph sg;
+    convertSplits(taxname, sg);
+    sg.removeTrivialSplits();
+    for (SplitGraph::iterator sit = sg.begin(); sit != sg.end(); sit++) {
+        if (!(*sit)->containTaxon(0))
+            (*sit)->invert();
+        insertSplit(new Split(**sit), 1);
+    }
+    
+    // check that constraint tree has a subset of taxa
+    StringIntMap fulltax_index;
+    for (it = fulltaxname.begin(); it != fulltaxname.end(); it++)
+        fulltax_index[(*it)] = it - fulltaxname.begin();
+
+    bool err = false;
+        
+    for(it = taxname.begin(); it != taxname.end(); it++)
+        if (fulltax_index.find(*it) == fulltax_index.end()) {
+            cerr << "ERROR: Taxon " << (*it) << " in constraint tree does not appear in full tree" << endl;
+            err = true;
+        }
+    if (err) {
+        outError("Bad constraint tree (see above)");
+    }
+    
+}
+
+
+bool ConstraintTree::isCompatible(StrVector &tax1, StrVector &tax2) {
+
+    assert(!empty());
+    
+    if (tax1.size() <= 1 || tax2.size() <= 1)
+        return true;
+
+    Split sp1(leafNum);
+    Split sp2(leafNum);
+    
+    StrVector::iterator it;
+    StringIntMap::iterator mit;
+    
+    int tax_count1 = 0;
+    
+    for (it = tax1.begin(); it != tax1.end(); it++)
+        if ((mit = taxname_index.find(*it)) != taxname_index.end()) {
+            // taxon found
+            tax_count1++;
+            sp1.addTaxon(mit->second);
+        }
+    if (tax_count1 <= 1)
+        return true;
+        
+    int tax_count2 = 0;
+    for (it = tax2.begin(); it != tax2.end(); it++)
+        if ((mit = taxname_index.find(*it)) != taxname_index.end()) {
+            // taxon found
+            tax_count2++;
+            sp2.addTaxon(mit->second);
+        }
+    
+    if (tax_count2 <= 1) 
+        return true;
+    
+    if (tax_count1 + tax_count2 == leafNum) {
+        // tax1 and tax2 form all taxa in the constraint tree
+        
+        // quick check if this split is contained in the tree
+        Split *res = NULL;
+        if (sp1.containTaxon(0))
+            res = findSplit(&sp1);
+        else
+            res = findSplit(&sp2);
+        if (res) return true;
+        
+        // otherwise, check for compatibility with all splits
+        for (iterator sit = begin(); sit != end(); sit++)
+            if (!sit->first->compatible(sp1))
+               return false;
+        return true;
+    } else {
+        // partial split
+        assert(tax_count1 + tax_count2 < leafNum);
+        Split taxa_mask(sp1);
+        taxa_mask += sp2;
+        Split* subsp = sp1.extractSubSplit(taxa_mask);
+        bool res = true;
+        for (iterator sit = begin(); sit != end(); sit++) {
+            Split *subit = sit->first->extractSubSplit(taxa_mask);
+            if (!subit->compatible(*subsp)) {
+                res = false;
+                delete subit;
+                break;
+            }
+            delete subit;
+        }
+        delete subsp;
+        return res;
+    }
+}
+
+bool ConstraintTree::isCompatible(Node *node1, Node *node2) {
+    if (empty())
+        return true;
+    StrVector taxset1, taxset2;
+    getUnorderedTaxaName(taxset1, node1, node2);
+    getUnorderedTaxaName(taxset2, node2, node1);
+    return isCompatible(taxset1, taxset2);
+}
+
+bool ConstraintTree::isCompatible (MTree *tree) {
+    if (empty())
+        return true;
+    NodeVector nodes1, nodes2;
+    tree->generateNNIBraches(nodes1, nodes2);
+//    tree->getAllInnerBranches(nodes1, nodes2);
+    StrVector taxset1, taxset2;
+    
+    // check that all internal branches are compatible with constraint
+    for (int i = 0; i < nodes1.size(); i++) {
+        taxset1.clear();
+        taxset2.clear();
+        getUnorderedTaxaName(taxset1, nodes1[i], nodes2[i]);
+        getUnorderedTaxaName(taxset2, nodes2[i], nodes1[i]);
+        if (!isCompatible(taxset1, taxset2))
+            return false;
+    }
+    return true;
+}
+
+
+
+bool ConstraintTree::isCompatible(NNIMove &nni) {
+    if (empty())
+        return true;
+    // check for consistency with constraint tree
+    StrVector taxset1, taxset2;
+    
+    // get taxa set 1 (below node1)
+    FOR_NEIGHBOR_DECLARE(nni.node1, nni.node2, it)
+        if (it != nni.node1Nei_it) {
+            getUnorderedTaxaName(taxset1, (*it)->node, nni.node1);
+        }
+    //taxset1 also includes taxa below node2Nei_it if doing NNI 
+    getUnorderedTaxaName(taxset1, (*nni.node2Nei_it)->node, nni.node2);
+    
+    // get taxa set 2 (below node2)
+    FOR_NEIGHBOR(nni.node2, nni.node1, it)
+        if (it != nni.node2Nei_it) {
+            getUnorderedTaxaName(taxset2, (*it)->node, nni.node2);
+        }
+    //taxset2 also includes taxa below node1Nei_it if doing NNI 
+    getUnorderedTaxaName(taxset2, (*nni.node1Nei_it)->node, nni.node1);
+    
+//        getUnorderedTaxaName(taxset1, node1, node2);
+//        getUnorderedTaxaName(taxset2, node2, node1);
+
+    return isCompatible(taxset1, taxset2);
+}
diff --git a/constrainttree.h b/constrainttree.h
new file mode 100644
index 0000000..6749fc6
--- /dev/null
+++ b/constrainttree.h
@@ -0,0 +1,81 @@
+//
+// C++ Interface: constrainttree.h
+//
+// Description:
+//
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+
+#ifndef CONSTRAINTTREE_H
+#define CONSTRAINTTREE_H
+
+#include "mtree.h"
+#include "alignment.h"
+
+struct NNIMove;
+
+/**
+    ConstraintTree used to guide tree search.
+    Note that constraint tree may contain only a subset of taxa from a full tree.
+*/
+class ConstraintTree : public MTree, public SplitIntMap {
+public:
+
+    ConstraintTree();
+
+    /**
+            initialize constraint tree
+            @param constraint_file the name of the constraint tree file
+            @param fulltaxname the full list of all taxa names
+     */
+    void initConstraint(const char *constraint_file, StrVector &fulltaxname);
+
+    /** 
+        check if a "partial" split defined by two taxa name sets is compatible with the constraint tree.
+        The union of 2 taxa set do not need to comprise all taxa in the constraint tree.
+        @param[in] tax1 names of taxa in one side of split
+        @param[in] tax2 names of taxa in other side of split
+        @return true if the split is compatible with all splits in the constraint tree, false otherwise.
+     */ 
+    bool isCompatible(StrVector &tax1, StrVector &tax2);
+
+    /**
+        check if a branch defined by two nodes in any tree is compatible or not
+        @param node1 one end node of the branch
+        @param node2 the other end node of the same branch
+        @return TRUE if the branch is compatible, FALSE otherwise 
+    */
+    bool isCompatible(Node *node1, Node *node2);
+
+    /**
+        @param tree input tree
+        @return TRUE if input tree is compatible with constraint, FALSE otherwise
+    */
+    bool isCompatible (MTree *tree);
+
+
+    /** 
+        check if an NNI is compatible with the constraint tree or not
+        @param nni an NNIMove
+        @return TRUE if the NNI is compatible, FALSE otherwise
+    */
+    bool isCompatible(NNIMove &nni);
+
+    /**
+        @param taxname taxon name to search for
+        @return TRUE if constraint tree has a taxon, FALSE otherwise
+    */
+    bool hasTaxon(string &taxname) {
+        return taxname_index.find(taxname) != taxname_index.end();
+    }
+
+protected:
+
+    /* map from taxon name to its index, used for quick taxon name search */
+    StringIntMap taxname_index;
+
+};
+
+#endif
\ No newline at end of file
diff --git a/example/example.nex b/example/example.nex
index 620068c..e3d323e 100644
--- a/example/example.nex
+++ b/example/example.nex
@@ -1,9 +1,9 @@
 #nexus
 
 begin sets;
-	charset part1 = 1-99\3 2-99\3;
-	charset part2 = 3-99\3;
-	charset part3 = 100-384;
+	charset part1 = 1-999\3 2-999\3;
+	charset part2 = 3-999\3;
+	charset part3 = 1000-1998;
 
 	charpartition mine = HKY:part1, GTR+G:part2, GTR+G: part3;
 end;
diff --git a/example/example.phy b/example/example.phy
index 8637b06..cbf0eb3 100644
--- a/example/example.phy
+++ b/example/example.phy
@@ -1,46 +1,18 @@
- 44 384 
-FL-1-103     atgcgcatcacccaaggc---------------------accttctccttcctgcccgacctcacggcggcccaggtcaaggcccagatccagtatgcgctggaccagaactgggcggtctcggtggagtacacggacgatccc------------------------------------------------------catccccggaacacctattgggagatgtggggcctgcccatgttcgacctgcgcgatgccgccggcgtctatggcgaggtcgaggcctgccgcaccgcccatcccggcaagtatgtgcgggtgaacgccttcgactccaatcgcgggtgggagacggtgcgcctctccttcatcgtccagcgtccg
-OSH-1-103    atgcgcatcacccaaggc---------------------tgcttctcgttcctgcccgacctgaccgacgagcagatctcggcgcaggtggactattgcctcggccgcggctgggccgtgagcctcgaacataccgacgacccg------------------------------------------------------catccccggaacacctactgggaaatgtggggcatgccgatgttcgacctgcgcgaccccaagggcgtgatgatcgagctggacgagtgccgcaaggcctggcccggccgctacatccgcatcaatgccttcgattccacccgcggcttcgagacggtcacgatgagcttcatcgtcaaccgcccc
-CEU-1-103    atgcgcatcactcaaggc---------------------actttttccttcctgcccgaactgaccgacgagcagatcaccaaacagctcgaatactgcctgaaccagggctgggcggtcggcctcgaatacaccgacgacccg------------------------------------------------------cacccgcgcaacacgtactgggagatgttcgggctgccgatgttcgacctgcgcgatgccgccggcatcctgatggaaatcaacaacgcgcggaacaccttccccaaccactacatccgcgtcacggccttcgattcgacgcatacggtggagtcggtggtgatgtcgttcatcgtcaatcgtccc
-TH-1-103     atgagacttacacaaggc---------------------gcattttcgttcttacctgacttaacagatgagcaaatcgtaaaacaaattcaatatgctatcagcaaaaactgggctttaaacgttgaatggacagatgatccg------------------------------------------------------caccctcgcaacgcatactgggatttatggggattaccattatttggtattaaagatccagcggctgtaatgtttgaaatcaatgcttgccgtaaagctaaaccagcttgttacgtaaaagtaaatgcgtttgataactcacgtggtgtagaaagctgctgcttatcttttatcgttcaacgtcct
-CAa1-103     atgaaactaacacaagga---------------------gctttctcatttcttcctgacttaactgatgcgcaagtaactaagcaaatccagtacgctttaaataagagttgggctatttcgattgaatatactgatgatccg------------------------------------------------------cacccacgtaacagttactgggagatgtggggccttcctctattcgatgttaaggatccagctgcgattcttttcgaaatcaacatggctcgtaaggctaagcctaactactaccttaaaatagcttgttttgataacacacgtggtatcgaaagttgtgtactttctttcattgtacaacgtcct
-CAb1-103     gtgagagttacacaagga---------------------acattttcttttctaccagacctgacaaatgatcaaatcagaaaacaaattcaatatgccataaataaaggatgggcattgagtgtagaatatacagatgaccct------------------------------------------------------cacccacggaattcttactgggaaatgtggggactgcctttatttgatgtcaaagaccctgcggcaattatgtttgaagttgaagcttgtcgaaaagagaaaagcaactattatattaagctattagcttttgattcaaccaaaggagttgaaagtacagcaatgtcctttatggtcaataggcct
-SI-1-103     atgagagttacacaagga---------------------tgtttttcgtttttaccagatttaagtgatgatcaaattaaacaacaagtttcttacgctatgagcaaaggttgggcggttagtgtagaatggacagatgatcca------------------------------------------------------catccacgtaactcatattgggaattatggggtcttcctttatttgatgttaaagatccagctgcagttatgtatgaacttgctgaatgtagaaaagttaacccagaaggttatattaaaattaatgctttcgatgctagtattggtacagaaagttgtgtaatgtcttttattgtacaacgtcct
-LU-1-103     gtgagacttacacaagga---------------------gctttttcttatttaccagatttaactgatgcacaaatcatcaaacaaattgactactgcttaagcagaggttggtctgttggtgttgaatggactgatgatcca------------------------------------------------------cacccacgtaacgcttactgggaactatggggtcttccattatttgacgtaaaagattcttcagcaattttatacgaagttaatgaatgtcgtcgtttaaaccctgaaggttacattaaattagttgctttcaacgcagcacgtggtactgaaagtagtgcatctgcttttattgtacaacgtcca
-SU-1-103     gtgagaataactcaaggt---------------------accttttcttttttgccggacttgactgatgaacaaatcaaaaaacaaattgattatatgatatctaaaaaattagctataggtattgaatatactaacgacata------------------------------------------------------catcctagaaattcattttgggaaatgtggggattacctctatttgaggtcacagatccagctccagtattatttgaaattaatgcttgtcgtaaagcaaaaagtaatttctatatcaaggtagtaggattttcttctgaaagaggtatagaaagtacaataatttcatttattgtaaatagacca
-RP-56-175    atgcaggtgtggccaccagttggcaagaagaagtttgagaccctttcataccttccacccctcactgatgagcaattgcttaaggaagtagagtatcttctaaggaagggatgggttccatgtgttgaatttgagttggagaaa------------------ggatttgtccaccgtcagtacaacagttcaccaggatactatgatggacgttactggacaatgtggaggttgccattgtttggaaccactgatgctgctcaggtgttgaaggaagttgctgaatgtaaagcagaatacccagaagctttcatccgtatcatcggatttgacaacgttcgt------caagtgcaatgcattagtttcattgcaagcacaccc
-A-14-133     atgcaggtgtggcctccaattggaaagaagaagtttgagactctttcctatttgccaccattgacgagagatcaattgttgaaagaagttgaataccttctgaggaagggatgggttccatgcttggaatttgagttgctcaaa------------------ggatttgtgtacggtgagcacaacaagtcaccaagatactatgatggaagatactggacaatgtggaagcttcctatgtttggcaccactgatcctgctcaagtcgtgaaggaggttgatgaagttgttgccgcttaccccgaagctttcgttcgtgtcatcggtttcaacaacgttcgt------caagttcaatgcatcagtttcattgcacacacacca
-PR-57-176    atgcaggtgtggccaccacgtaatttgaagaagtttgagaccctatcataccttccaactctttccgaggagtcattgttgaaggagatcaactaccttctaatcaagggatgggttccttgccttgagttcgaagttggaccg------------------gcacatgtataccgtgagaacaacaagtcaccaggatactatgacggaaggtactggacaatgtggaagctacccatgttcggatgcactgacgcatcccaagttgcagctgaggtggtcgagtgcaagaacgcttaccctgatgcccacgtcagaatcattggattcgacaacaagcgt------caagtccagtgcatcagtttcattgcctacaaacct
-PY-61-180    atgcaggtgtggcctccactcggactgaagaagttcgagaccctctcttaccttcctcccctttcttccgagtccttggccaaggaagttgactacctcctccgcaagaactgggttccctgcttggaatttgagttggagact------------------ggattcgtgtaccgtgagaaccacaggtccccaggatactatgatggaaggtactggacaatgtggaagctgcccatgttcggatgcaccgactcttcccaggtgttgaaggagctggaagaggccaagaaggcttacccccagtccttcatccgtatcatcggattcgacaatgtccgt------caagtgcagtgcatcagtttcatcgcttacaagcct
-MGI-58-176   atgcaggtgtggccgccggagggcctgaagaagttcgagaccctctcctacctcccccctctctccgtcgaggacctcgccaaggaggtggactacctcctccgcaacgactgggttccctgcatcgagttctccaaggaa---------------------gggttcgtgtaccgcgagaaccacgcgtcgcccgggtactacgacgggcggtactggacgatgtggaagctgcccatgttcggctgcaccgacgccagccaggtgatcgccgaggtggaggaggccaagaaggcctaccccgagtacttcgtcagaatcatcggcttcgacaacaagcgc------caagtccagtgcatcagcttcatcgcctacaagccc
-SCR-58-177   tgcatggtgtggccaccactaggaatgaagaagtttgagactctgtcttacctgccccctctatccgaagagtcattgttgaaggaggtccaataccttctcaacaatggatgggttccctgcttggaattcgagcccactcac------------------ggatttgtgtaccgtgagcacggaaacacaccaggatactacgatggacgttactggacaatgtggaagttgcccatgttcggttgcactgacccatcccaggttgttgctgagctcgaggaggccaagaaggcttaccctgaggccttcatccgtatcataggattcgacaacgtgcgt------caagtccagtgtgtcagtttcatcgcctacaagccc
-SA-60-179    atgaaggtgtggccaccacttggattgaggaagttcgagactctttcttacctgcctgatatgagtaacgaacaattgtcaaaggaatgtgactaccttctcaggaatggatgggttccctgcgttgaattcgacatcggaagc------------------ggattcgtgtaccgtgagaaccacaggtcaccaggattctacgatggacgttactggaccatgtggaagctccctatgtttggctgcaccgactcatctcaggtgattcaggagattgaggaggctaagaaggaataccccgacgcattcatcagggttattggctttgacaacgtccgt------caagtccagtgcatcagtttcatcgcctacaagccc
-BR-60-179    atgcaggtatggccaccacgtgggaagaagttctacgagactctctcataccttccaccccttacaagggagcaattggccaaggaagttgaataccttcttcgcaagggatgggttccttgcttggaattcgagttggagcat------------------ggaaccgtgtaccgtgagtaccacagatcaccagggtactatgatggtcgttactggaccatgtggaagctgcccatgtttggttgcacagatgcagtgcaggtgttgcaggagcttgatgagatgattaaagcttacccagattgctatggtaggatcattggtttcgacaatgttcgc------caagtccagtgcattagtttccttgcctacaagcct
-CPL-58-177   atgcaggtgtggccaccaattaacaagaagaagtacgagactctctcatacctccctgatttgagccaagagcaattgcttagcgaaattgagtaccttttgaaaagtggatgggttccttgcttggaattcgaaactgagcgc------------------ggatttgtctaccgtgaacaccaccattcaccaggatactatgacggcaggtactggaccatgtggaagctacctatgttcggatgcactgatgccacccaagtgttggctgaggtggaagaggcgaagaaggcatacccacaggcctgggtccgtattattggattcgacaacgtgcgt------caagtgcagtgcatcagtttcattgcctacaagcca
-LTU-59-178   atgcaggtgtggccaccaattaacatgaagaaatacgagacattgtcataccttcctgacttgtccgatgagcaattgctcaaggaagttgagtaccttttgaaaaatggatgggttccttgcttggaattcgagactgagcac------------------ggatttgtgtaccgtgagcacaacagctcaccaggatactacgatggtagatactggaccatgtggaagttgcctatgtttgggtgcactgacggaacccaggtgttggctgaggttcaagaggccaagaatgcgtacccacaggcctggatccgtattatcggattcgacaacgttcgt------caagtgcagtgcatcagtttcattgcctacaagcca
-TSP-58-177   atgcaggtgtggcccccatatggcaagaagaagtacgagactctctcataccttcctgatttaaccgacgagcaattgctcaaggagattgagtaccttttgaacaagggatgggttccttgcttggaatttgagactgagcac------------------ggatttgtctaccgtgaataccacgcctcacctagatactatgatggaaggtactggaccatgtggaagttgcccatgtttgggtgcactgatgcaactcaggtgttgggtgagctccaagaggccaagaaggcttaccctaatgcatggatcagaatcatcggattcgacaacgtccgt------caagtgcaatgcatcagtttcattgcctacaagcca
-YBN-56-175   atgcaggtgtggccaccagttggcaagaagaagtttgagactctttcctacctgccagaccttgatgatgcacaattggcaaaggaagtagaataccttcttaggaagggatggattccttgcttggaattcgagttggagcac------------------ggtttcgtgtaccgtgagcacaacaggtcactaggatactacgatggacgctactggaccatgtggaagctgcctatgtttggttgcactgatgcttctcaggtgttgaaggagcttcaagaggctaagactgcataccccaacggcttcatccgtatcatcggattcgacaacgttcgc------caagtgcagtgcatcagcttcatcgcctacaagccc
-AN-56-175    atgaaggtgtggccaccacttggattgaagaagtacgagactctctcatacttaccaccactaactgaaactcagttggctaaggaagtcgactacttgctccgcaaaaaatgggttccttgtttggaattcgagttggagcac------------------ggttttgtctaccgtgagaacgccagatcccccggatactatgacggaagatactggacaatgtggaaattgcctatgttcggttgcaccgactcagcccaagtgatgaaggagcttgctgaatgcaagaaggagtacccccaggcctggatccgtatcatcggatttgacaatgttcgt------caagttcaatgtatcatgttcattgcttccaggcca
-HI-60-179    atgcaggtgtggcctcctcttgggaagaagaagttcgagacactctcatacctccccgatcttacacccgtacagttggctaaggaagtagattaccttcttcgctctaaatggattccttgcttggaattcgaattagaggag------------------ggattcgtgcaccgtaagtactcgagcttacccacgtactacgatggacgctactggaccatgtggaaactgcccatgtttgggtgcactgactcggctcaggtgttggaggagcttgagaattgcaagaaggaataccccaatgcattcattagaatcattgggttcgacaacgttcgt------caagtgcagtgcattagtttcattgcctacaagcct
-ANA-56-175   atgaaggtgtggccaccagttggaaagaagaagtttgagaccctctcttaccttcctgaccttaccgaagttgaattgggtaaggaagtcgactaccttctccgcaacaagtggattccttgtgttgaattcgagttggagcac------------------gggtttgtttaccgtgagcacggaagcacccccggatactacgatggccgttactggacaatgtggaagcttcccttgttcggatgcactgactctgctcaagtgttgaaggaagtccaagaatgcaaaacggagtaccctaacgctttcatcaggatcatcggattcgacaacaaccgt------caggtccagtgcatcagtttcatcgcctacaagcca
-ZE-48-166    atgcaggtgtggccggcctacggcaacaagaagttcgagacgctgtcgtacctgccgccgctgtcgacggacgacctgctgaagcaggtggactacctgctgcgcaacggctggataccctgcctcgagttcagcaaggtc---------------------ggcttcgtgtaccgcgagaactccacctccccgtgctactacgacggccgctactggaccatgtggaagctgcccatgttcggctgcaacgacgccacccaggtgtacaaggagctgcaggaggccatcaaatcctacccggacgccttccaccgcgtcatcggcttcgacaacatcaag------cagacgcagtgcgtcagcttcatcgcctacaagccc
-EAT-48-166   atgcaggtgtggccaattgagggcatcaagaagttcgagaccctgtcttacttgccacccctctccacggaggccctcttgaagcaggtcgactacttgatccgctccaagtgggtgccctgcctcgagttcagcaaggtt---------------------ggcttcgtcttccgtgagcacaacagctcccccgggtactacgacggtcgatactggacaatgtggaagctgcctatgttcgggtgcaccgacgccacacaggtgctcaacgaggtggaggaggttaagaaggagtaccctgatgcgtatgtccgcgtcatcggtttcgacaacatgcgc------caggtgcaatgcgtcagcttcattgccttcaggcca
-YSA-46-164   atgcaggtgtggccgattgagggcatcaagaagttcgagaccctctcctacctgccaccgctcaccgtggaggacctcctgaagcagatcgagtacctagctccgttccaagtggtgccctgcctcgagttcagcaaggtc---------------------ggatttgtctaccgtgagaaccacaagtcccctggatactacgacggcaggtactggaccatgtggaagctgcccatgttcgggtgcaccgacgccacccaggtcgtcaaggagctcgaggaggccaagaaggcgtaccctgatgcattcgtccgtatcatcggcttcgacaacgttagg------caggtgcagctcatcagcttcatcgcctacaacccg
-TH-52-170    atgcaggtgtggcctccattcggaaaccccaagtttgagactctgtcctacctccctacgctaaccgaggagcagctggtgaaggaggttgagtacttgttgaggaacaagtgggtgccttgtctagagtttgatctggaa---------------------ggatccatctcgaggaagtataataggagcccggggtactacgatgggagatactgggtgatgtggaagttgccgatgtttgggtgcacagaggcatctcaggtgataaacgaggtgagagagtgtgccaaggcataccccaaagccttcatccgtgtcattggctttgacaacgtccgc------caagtgcagtgcatctccttcatcgtccacaagccc
-LA-68-186    atgcaggtgtggcctccttacgcgaataaaaagtttgagactctgtcgtatctccctcgcttgaccccggagcaactggtgaaggaggtggagtacctgctgaagaacaagtgggtgccctgcctggaattcgaggaggat---------------------ggtgaaataaagagagtgtatgggaatagcccagggtactacgacgggagatactgggtgatgtggaagctgcctatgttcggatgcacagaggcatcgcaggtgttgaacgaggtgaacgagtgtgcgaaggcataccccaacgccttcatccgcgtcatcggattcgacaacgtccgc------caagtgcagtgcatctccttcatcgtccacaagcct
-GR-854-978   atgaaggtgtggaaccccgtcaacaacaagaagttcgagaccttctcctacctgccccccctgtctgacgcccagatcgccaagcaggtggacatgatcattgccaaggggctctccccctgcctggagttcgccgccccggagaacagcttcatcgccaatgacaacactgtgcgcttcagcggcaccgctgcaggctactatgacaaccggtactggaccatgtggaagctgcccatgttcggctgcacggacgccagccaggtgctgcgtgagatctccgagtgccgcagggcctacccccagtgctacgtccgc---ctggccttcgactccgtcaag------caggtgcaggtgatctcgttcgtggtgcagcgcccc
-MO-29-154    ttcaaggtctggcagcccgtgaacaacaagcagtacgagaccttctcctacctgccccccctgaccaaccagaagatcggccgtcaggtcgactacatcatcaacaacggctggaccccctgcttggagttcgctgacccctccacctccttcgtcagcaacgcgaacgccgtgcgcctccagggtgtctccgctggctactacgacaacaggtactggaccatgtggaagctgcccatgttcggctgcactgaccccagccaggtgctgcgcgaggtgtccgcctgccaggtggccttccccaacgtgtacatccgcctggttgccttcgacaacgtcaag------caggtgcagtgcatgggcttcctagtgcagcgcccc
-OE-36-161    atgatggtatggtagccctttaacaataagttctttgagaccttctcgtacttgccccctctcactgacgaccaaatcaccaagcaagtggactacatcttgagaaacaattggactccttgtctggagtttgcgggatccgaccaagcgtatgtgacccacgacaacacggtaagaatgggagattgtgcatccacttatcaggacaacagatattggaccatgtggaaattgcctatgttcggttgcattgatggatcgcaagtgttgaccgaaatttcagcttgcactaaggcctttcctgatgcctacatccgtttggtgtgttttgatgcaaatagg------caagtccaaatttccggctttttggtacataggccc
-EME-43-168   atgatggtttggtagcccttcaacaacaaaatgtttgaaactttttccttcttgcctcccttgactgatgaacaaattagcaaacaagtggactacatcttggccaactcctggaccccctgtcttgaatttgcagcttctgatcaagcttatgctggcaatgaaaattgcatcagaatgggacctgtggcttctacctaccaagacaatagatattggacaatgtggaagctacctatgtttggatgcacagacggctctcaagtgttgagcgagatccaagcatgcacaaatgctttccccgatgcttacatcagattggtttgttttgacgcaaacaga------taggtgtaaatttctggatttttggtgcacagacct
-LRE-46-171   atgatggtctggaccccggtcaacaacaagatgttcgagaccttctcctacctgccccccctgagcgacgagcagatcgccgcccaggtcgactacattgtcgccaacggctggatcccctgcctggagttcgctgagtcggacaaggcctacgtgtccaacgagtcggccatccgcttcggcagcgtgtcttgcctgtactacgacaaccgctactggaccatgtggaagctgcccatgttcggctgccgcgaccccatgcaggtgctgcgcgagatcgtcgcctgcaccaaggccttccccgatgcctacgtgcgcctggtggccttcgacaaccagaag------caggtgcagatcatgggcttcctggtccagcgcccc
-P6-2-107     atgaaaactctgcccaaa------gagcgtcgtttcgagactttctcgtacctgcctcccctcagcgatcgccaaatcgctgcacaaatcgagtacatgatcgagcaaggcttccaccccttgatcgagttcaacgagcac------------------------------------------------------tcgaatccggaagagttctactggacgatgtggaagctccccctgtttgactgcaagagccctcagcaagtcctcgatgaagtgcgtgagtgccgcagcgaatacggtgattgctacatccgtgtcgctggcttcgacaacatcaag------cagtgccaaaccgtgagcttcatcgttcatcgtccc
-HO-1-106     atgaaaactctgcccaaa------gagcgtcgctacgaaaccctttcctacctgccccccctgagcgatcagcaaattgctcgccagattgagtacatggtgcgcgaaggctatattcccgccgtggaattcaacgaagat------------------------------------------------------tccgacgcgaccacctgctactggaccatgtggaagttgcccctgttccacgccacttctacccaagaagtgttgggcgaagtgcgcgagtgccgcaccgaataccccaactgctacatccgcgtagttggtttcgacaacatcaag------cagtgtcagtccgtgagcttcatcgttcacaagccc
-SP-1-106     atgcaaaccttaccaaaa------gagcgtcgttacgaaaccctttcttacttaccccccctcaccgacgttcaaatcgaaaagcaagtccagtacattctgagccaaggctacattccagccgttgagttcaacgaagtt------------------------------------------------------tctgaacctaccgaactttattggacactgtggaagctacctttgtttggtgctaaaacatcccgtgaagtattggcagaagttcaatcttgccgttctcaatatcctggtcactacatccgtgttgtaggatttgacaatattaag------cagtgccaaatcctgagcttcatcgttcacaaaccc
-PA-1-105     ---atgcaacttagagta------gaacgtaagttcgaaactttttcttatttaccaccattaaacgaccaacagattgcgcgtcaattacaatacgcactttccaatggttatagcccagcaatcgaattcagttttaca------------------------------------------------------ggtaaagctgaagacttagtatggactttatggaaattacctttatttggtgcacaatctcctgaagaagtacttagcgaaattcaagcttgtaaacaacagttccctaatgcttacattcgtgttgtagcatttgactctatcaga------caagttcaaactttaatgttcttagtttacaaacca
-NE-2-109     gctgaaatgcaggattacaagcaaagcctcaaatatgagactttctcttatcttccacccatgaacgcggaacgcatccgcgctcaaatcaagtacgcaattgctcaaggctggagccccggcattgagcacgtagaagtgaaa------------------------------------------------------aactccatgaaccaatattggtacatgtggaaacttcccttcttcggcgaacaaaatgtcgacaacgtgttggctgaaattgaagcgtgtcgtagtgcgtatccaacacaccaggtcaaactggtggcttatgacaactatgcg------caaagcttaggtctggccttcgtggtctaccgcggc
-IFE-2-109    gctgacattcaggactacaactcaacacccaagtacgaaaccttctcttatttgccggcaatgggaccggaaaaaatgcgccgtcagatcgcctatctcatcaatcagggctggaaccccggcatcgagcatgtggaacctgaa------------------------------------------------------cgcgcatcaacatactactggtacatgtggaagttacccatgttcggcgaacagtcggtggacaccgtgatcatggagttggaagcatgccatcgcgctcaccccggccatcacgtgcgcttggtcgggtatgacaattactcg------cagagccagggcagcgcttttgtggtgtttcgcggg
-HS-9-115     ---tcgagcgtcagcgatccgtcgagccgcaagttcgagaccttctcctacctgcccgaactcggcgtggaaaagatccgcaagcaggtcgagtacatcgtcagcaagggctggaacccggccgtcgagcacaccgagccggag------------------------------------------------------aacgccttcgaccactactggtacatgtggaagctgccgatgttcggcgaaaccgacgtggacgccatcctggccgaggccgaggcatgccacaaggcgcatccctcgcatcacgtgcgcctgatcggctacgacaactatgcc------cagtcgcaaggcactgccatggtgatcttccgcggc
-RVI-7-114    agttccagcctcgaagacgtcaacagccgcaagttcgagaccttctcctacctgccgcgcatggatgccgaccgcatccgcaagcaggtcgagtacatcgtctccaagggctggaacccggccatcgagcacaccgagccggaa------------------------------------------------------aacgccttcgatcactactggtacatgtggaagctgccgatgttcggcgagaccgacatcgacaccatcctcaaggaggccgaagcctgccacaaggcgcaccccaacaatcacgtgcgtctgatcggcttcgacaactatgcc------cagtccaagggcgccgagatggtggtctatcgcggc
-IFE-8-115    aaatcccgtctctccgacccggcgagcgcgaagttcgagacactgtcttacctgcccgccctgaccgcggacgagatccgtcaacaggttgcgtatattgtttccaagggctggaatccggcggtagaacataccgaaccggaa------------------------------------------------------aacgccttcggcaactactggtatatgtggaagttgcccatgttcggcgaaacggacgtggacaccattctgaaagaagcggaacgctgccataagcggaatccccataaccacgtccgtatcgtcggctatgataacttcaag------cagtcccagggtacttccctggtagtctatcggggc
-RVI-5-112    agcagcatgggcgatcacgccaccatcggccgctacgagaccttttcctatctgccgccgctcaaccgcgaggagatcctggagcagatcctctacatcctcgacaacggctggaacgcctcgctggagcacgagcatccggat------------------------------------------------------cgcgccttcgagtattactggccgatgtggaagatgcccttcttcggcgaacaggatccgaacgtgatcctgaccgagatcgagtcctgccggcgcagctatccggaccatcacgtccggctggtcggctacgacacctacgcc------cagagcaagggacattccttcctggcgcaccgcccg
-
+17 1998
+LngfishAu  CTCCCACACCCCAGGAACCAGCAGTGATTAACATTGAGCATAAGCGAAGCTTGACTCAGCCACCTCGGGCCGGTAAACCTCGTGCAGCCACCGCGGTTATACGAAGGACCCGATTGATGTCTAGGCGTAAAGGTGATTAATAGACTAATTAGAGTTAAAACCCCATCCAGCCGCGCATCCATAAAATCTAGACTACAACTACTTTCACGAAAGCTAAGATACAAACTGGGATTAGAT-CC-ACTTGCTCAGCCATAAACTTTGACTACTAAAAGGTCCGCCAGTACTACGAGGGCCAGCTTAAAACCCAAAGGACTTGCGGTGCCTACCCACCTAGAGGAGCCTGTTCTAGAACCGATAATCCACGTTAAACCTCACCCTTCTTGCCCCGTCTATATACCACCGTCGCCAGCTTACCCCGGGGTGAAATAAGCACAATTGTCAACCAAAAACGTCAGGTCGAGGTGTAGCGAATGAAGCGG [...]
+LngfishSA  CAACCACACCCCAGGAAACAGCAGTAATTAATCTTAGGCATAAGTGAAACTTGACCTAGTTATTAAAAATCGGCCAATCTCGTGCAGCCGCCGCGGTTATACGAGAGATTTTATTGATAAATTGGCGTATAGGTGATTAGAATACTTATTAAAATTTAACTTTAGCCAAGCTGCGCTTCCGCAAAATCATTATTAAATTATTCCTCACGAAAGCCAAGAAACAAACTAGGATTAGATCCCTACTTGCTTGGCTATAAACTATAGTTTTTAATCAACTCGCCAGGACTACTAGCACAAGCTAAAAACTCAAAGGACTTGCGGTGCCTACCCACCTAGAGGAGCCTGTCCTAAAACCGATAATCCACGTTTTACCTAACCCTTTTTGCCCAGCCTATATACCGCCGTCGCCAGCCAACCCCGGAGAAATATAGGCAAAATTACTAGTTAAATACGTCAGGTCGAGGTGTAGCATATGAAGTGG [...]
+LngfishAf  CAACCACACCCCAGGACACAGCAGTAATTAAAATTGGACATAAGTGTAACTTGATCCAGCCATTAAAAGTTGGCCAACCTCGTGCAGCCGCCGCGGTTACACGAGGAACTTAATTGATGCCTCGGCGTATAGATGATTAGAGAACTTTCTAAAATCAAATATTGGCCCTGCTGCGCGCTCGCAAACTCAAAATTAAATACATCCTCATGAAAGTCAGGAAACAAACTAGGATTAGATCCCTACTTGCCTGACCCTAAACTATGACTCTTAATAAGCCCGCCAGAACTACAAGCCCAAGCTTAAAACCCAAAGGACTTGCGGTGCCTACCCACCTAGAGGAGCCTGTTCTAGAACCGATAATCCACGTTTTACCCAACCTTCCCTAGCCAGCCTATATACCGCCGTCGCCAGCCAACCCCCGGAGACTATTGGCAGAATAGTACATCTAGCACGTCAGGTCGAGGTGTAGCACATGAGAAGG [...]
+Frog       AAATTTGGTCCTGTGATTCAGCAGTGATAAACATTGAACATGAGCGAAGCTCGATTCAGTTATAAAGAGTTGGTCAATCTCGTGCAGCCGCCGCGGTTATACGAGAAACTCAATTGATATTTTGGCGTAAACGTGATTAAGAACCCAACTAGAGTCAAACTCCAACCAAGCTGCGCTTTCGTAAGAACACGAAACAATACACTCTCACGACCGCTAGGAAACAAACTGGGATTAGATCCCCACTTGCCTAGCCATAAACTTTGACTTACGAAAAATCCGCCAGAACTACGAGCCTAAGCTTAAAACCCAAAGGACTTGCGGTGCTCACCCACCTAGAGGAGCCTGTTCTGTAATCGATACCCCTCGCTAAACCTCACCCTTCTTGCCCCGCCTATATACCACCGTCGCCAGCCCACCTCGGAGATTCTTAGGCTTAATGATTTCATCAACACGTCAGGTCAAGGTGTAGCATATGAAGTGG [...]
+Turtle     CTTCCACACCCCAGGACTCAGCAGTGATAAAAATTAAGCATAAGCGAAGCTTGACTTAGTCACAATGAGCTGGTAAATCTCGTGCAGCCACCGCGGTTACACAAGAAGCCCAACTAACGACAAGGCGTAAAAGTGATTAAATACCCATTTAAGGTGAACTACTTACTTCGCTGCGCAAAAGTACACAGAAAATAAAGACTATTCCCACGATCGCTAAAACACAAACTGGGATTAGATCCCCACTTGCTTAGCCCTAAACCTAGATTTTTACAAAATCCGCCAGAATTACGAGCAAAAGCTTAAAACTCTAAGGACTTGCGGTGCTCACCCACCTAGAGGAGCCTGTTCTATAATCGATAACCCACGATCTACCTCACCTCTCTTGCCCAGCCTATATACCACCGTCGCCAGCTTACCCCGGGATACAATAAGCAAGATAAAACCATTAACAAGTCAGGTCAAGGTGTAGCTAACTGAGATG [...]
+Sphenodon  CTCCCACACCCCAGGACACAGCAGTGATTAATATTAAGCATAAGTGAAACTTGACTTAGTTAAGAACGGCCGGTCAATTTCGTGCAGCCACCGCGGTTAAACGATAGGCCGAAGTAAGGCCAAGGCGTAAAAGTAACTAAACCCCCTTCTAAACCCAAGAAAAAACTAAGATGAGTTGTAAAACCTCTGAAGATAAGTAAAATCTTACTAAAATTAAGGGTCAAACTGGGATTAGATCCCCACTTGCTTAATCCTAAACATCGACTAATACAAGGTTCGCCCGAACTACCAGCAAAAGCTAGAAACCCTAAAGACTTGCGGTGCCCAACCCCCTAGAGGAGCCTGTTCTATAATTGATGATCCGCAATAAACCTCACCTTTTTCGCCCAGCCTATATACCGCCGTCGCCAGTCTACCTTGAAGAACTATAGGTCAAACAGAACCGCTAATACGTCAGGTCAAGGTGCAGCCAATAAAATGG [...]
+Lizard     CTTCCACACCCAAGGCATCAGCAGTGATAAACATTAAGCATAAGCGAAGCTTGACTTAGTTATTAAGGGCCGGTAAACCTCGTGCAGCCACCGCGGTTATACGAGGGGCCCAAGCAGCGACTCGGCGTAAATGTGGCCAAAACCAACATAAAAACTAAACAATAGCCTAACCGAAACACAGGAAATGCAAACGTAACGTAATTCCCACGAAAACTAAGAAACAAACTAGGATTAGATCCCTACTTGCTTAGTCGTTAATACGATATTACACTAAATCCGCCAGAACTACGAGCGAAAGCTTAAAACTCAAAGGACTTGCGGTGCTCACCGACTTAGAGGAGCCTGTCCTATAATCGATACTCCACGCTAAACCTCACCATCTTTGCCCAGCCTATATACCGCCGTCGTCAACCTACCTCAGAGAAAAATAAGTACAAAAGTAAAACTAAAACGTCAGGTCAAGGTGTAGCTAATAGAATGG [...]
+Crocodile  CTCCCACACCCCAGGCCACAGCAGTAGTTAATATTAGGCATAAGCGAAGCCTGACCTAGTAAGGAAGGGCCGGTTAATCTCGTGCAGCGACCGCGGTTATACGACAGACCCAAATAATGATACGGCGTAAAGACGACTATATTCCCTACCTAGGGAAGAATAACCCCAGGCTGAGCCATAGGAATAACATATTCAAAACAACTCTCGTGAAAGCTAGGACATAAACTAAGATTAGATCCTTACTTACCTAGCTGTAACACAATAATCAAACCTAATTCGCCAAAACTACGAGCAATAGCTTAAAACTCAAAGGACTTGCGGCACTTACCCCCCTAGAGGAGCCTGTCCTATAATCGACAGTACACGTTACACCCGACCCCTTTAGCCCAGTCTGTATACCGCCGTCGCAAGCCCGTCCCAGGGAAACACGCGCGCAACAGTCGAGCTAACACGTCAGGTCAAGGTGCAGCCAACAAGGTGG [...]
+Bird       CTACCACACCCCAGGACTCAGCAGTAATTAACCTTAAGCATAAGTGTAACTTGACTTAGCCACAAAGGGTTGGTAAATCTTGTGCAGCCACCGCGGTCATACAAGAAACCCAATCAATGCTACGGCGTAAAGGTGGCCAATTTGCACCCTAAGATTAAAATGCAACCAAGCTGAGCCTAAGAAACCCCAAATCCAAGTTAATTCCCACGAAAGCTAGGACCCAAACTGGGATTAGATCCCCACTTGCCTAGCCCTAAATCTAGATTCCCACACAATCCGCCTGAACTACGAGCACAAGCTTAAAACTCTAAGGACTTGCGGTGCCCACCCACCTAGAGGAGCCTGTTCTATAATCGATAATCCACGATTCACCCAACCCCCCTTGCCCAGCCTACATACCGCCGTCGCCAGCCCACCTCTAAGAACAATGAGCTCAATAGCCTCGCTAATAAGACAGGTCAAGGTATAGCCTATGGGGTGG [...]
+Human      CTACCACACCCCAGGAAACAGCAGTGATTAACCTTTAGCATAAACGAAGTTTAACTAAGCTATAAAGGGTTGGTCAATTTCGTGCAGCCACCGCGGTCACACGATTAACCCAATCAATGAAGCGGCGTAAAGGTGTTTTGACCCTCCCATAAAGCTAAAACTCACCTGAGTTGAACTCCAGTAAAATTACGAAAAAAATCTGACACACAATAGCTAAGACCCAAACTGGGATTAGATCCCCACTTGCTTAGCCCTAAACCTCAACTAACACAAAGCTCGCCAGCACTACGAGCCACAGCTTAAAACTCAAAGGACCTGCGGTGCTTATCCCTCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCCCTCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCCTGAGGCTACATAAGCGCAAGTACCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGAGGTGG [...]
+Seal       CCACCACACCCCAGGATACAGCAGTAATAAAAATTAAGCATGAACGAAGTTTGACTAAGCTATAAAGGGTTGGTAAATTTCGTGCAGCCACCGCGGTCATACGATTAACCCAACTAATGGCCCGGCGTAAAGGTGTTAAGACAACCCACTAAAGCTAAAACCTAACCAAGCCGAGCTACCGTAAAATCACGAAAACAATTCTGTGCACGATAGCTAAGATCCAAACTGGGATTAGATCCCCACTTGCTTAGCCCTAAACATAAATTCATACAAAATTCGCCAGAACTACTAGCAACAGCTTAAAACTCAAAGGACTTGCGGTGCTTACCCCTCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATAAACCTCACCTTCCTTGCTCAGTCTATATACCGCCATCTTCAGCAAACCCTTAGGAACAATAAGCACAATAACTACATAAAAAAGTTAGGTCAAGGTGTAACCTATGGAATGG [...]
+Cow        CTACCACACCCCAGGAAACAGCAGTGACAAAAATTAAGCATAAACGAAGTTTGACTAAGTTATAAAGGGTTGGTAAATCTCGTGCAGCCACCGCGGTCATACGATTAACCCAACTAACGGAGTGGCGTAAAAGTGTTAAGCCCATACCATAGGGTTAAATTCTAACTAAGCTGAGCCATGATAAAATGACGAAAACAAGCCGACGCACTATAGCTAAGACCCAAACTGGGATTAGATCCCCACTTGCTTAGCCCTAAACACAGATTACAACAAAATTCGCCAGTACTACTAGCAACAGCTTAAAACTCAAAGGACTTGCGGTGCTTATCCTTCTAGAGGAGCCTGTTCTATAATCGATAAACCCCGATAAACCTCACCATTCTTGCTCAGTCTATATACCGCCATCTTCAGCAAACCCTAAGGAAAAATAAGCGTAATTAGTACATAAAAACGTTAGGTCAAGGTGTAACCTATGAAATGG [...]
+Whale      CTACCACGCCCCAGGACACAGCAGTGATAAAAATTAAGCATAAACGAAGTTCGACTAAGTCATAAAGGGTTGGTAAACTTCGTGCAGCCACCGCGGTCATACGATCGACCCAATTAATGAAGCGGCGTAAAGGTGTTAAGACCACATGATAAAGTCAAACCTTAATTAAGCTGAGCCCTAATAAGCCTACGAAAAAAAATCTGCACACGACAGCTAAGATCCAAACTGGGATTAGATCCCCACTTGCTTAGTCGTAAACCCCAATCACAACAAGATTCGCCAGTACTACTAGCAACAGCCTAAAACTCAAAGGACTTGCGGTGCCTACCCATCTAGAGGAGCCTGTTCTGTAACCGATAAACCCCGATCAACCTCACCACCCTTGCTCAGTCTATATACCGCCATCTTCAGCAAACCCTAAGGGAGAATAAGCATAACCACTACATAAAAACGTTAGGTCAAGGTGTAACCCATGGGTTGG [...]
+Mouse      CTACCACACCCCAGGACTCAGCAGTGATAAATATTAAGCATAAACGAAGTTTGACTAAGTTATACAGGGTTGGTAAATTTCGTGCAGCCACCGCGGTCATACGATTAACCCAACTAATATCTTGGCGTAAAAGTGTCAATAAATAAATATAGAATTAAAATCCAACTTATATGATTCATTGTAAACTAACGAAAAGCTTTATATACACGACAGCTAAGACCCAAACTGGGATTAGATCCCCACTTGCTTAGCCATAAACCTAAATTAATACAAAATTTGCCAGAACTACTAGCCATAGCTTAAAACTCAAAGGACTTGCGGTACTTATCCATCTAGAGGAGCCTGTTCTATAATCGATAAACCCCGCTCTACCTCACCTCTCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCCTAAGGTATTATAAGCAAAAGAACAACATAAAAACGTTAGGTCAAGGTGTAGCCAATGAAATGG [...]
+Rat        CTACCACACCCCAGGACTCAGCAGTGATAAATATTAAGCATGAACGAAGTTTGACTAAGCTATACAGGGTTGGTAAATTTCGTGCAGCCACCGCGGTCATACGATTAACCCAACTAATATTTTGGCGTAAAAGTGCCAATAAATCTCAATAGAATTAAAATCCAACTTATATGATTCATTGTAAGCCAACGAAAAACTTTATATGCACGATAGCTAAGACCCAAACTGGGATTAGATCCCCACTTGCTTAGCCCTAAACCTTAATTAATACAAAATTTGCCAGAACTACTAGCTACAGCTTAAAACTCAAAGGACTTGCGGTACTTATCCATCTAGAGGAGCCTGTTCTATAATCGATAAACCCCGTTCTACCTTACCCTTCTCGCTCAGCCTATATACCGCCATCTTCAGCAAACCCTAAGGCACTATAAGCACAAGAACAACATAAAAACGTTAGGTCAAGGTGTAGCCAATGAAGCGG [...]
+Platypus   CCTCCACACCCCAGGACACAGCAGTAATAGAAATTAGTCATAAACGCAGTTTGAACAAGTCATCAAGAGTCGGTAAATTTCGTGCAGCCACCGCGGTCATACGATTGACTCAACTAACAATAAGGTGTAAAAGTGTTTAAAATTAAACATAAGATTAAAGTAGAACTAAACTGAGTCATAGTAAAGCTACGAAAAGCAATTGATACACGATAGCTAAGGTACAAACTGGGATTAGATCCCCACTTGCTTAGCCCTAAACTCAAGTTTAAACAAAACTCACCAGAACTACTAGCAACAGCTTAAAACTCAAAGGACTTGCGGTGCTTACCCCTCTAGAGGAGCCTGTTCTATAATCGATAAACCCCGATACACCTCACCTCTTTTGCCCTGTCTATATACCGCCATCGTCAGCCAACCCTAAGGAACAATAGGCGTAATCATTTCATAAAAACGTTAGGTCAAGGTGTAGCCTATAAGATGG [...]
+Opossum    CTTCCACACCCCAGGAGACAGCAGTGATTAAAATTAAGCATAAACGAAGTTTGACTAAGTCATTTAGGGTTGGTCAATTTCGTGCAGCCACCGCGGTCATACGATTAACCCAATTAATAATAAGGCGTAAAGGTGTTTAGTAATACAAATAAAGTTAATAATTAACTAAACTGCGTTCTAGTAAAATAATAAAAAAACACCGATACACGAAAACTAAGACACAAACTGGGATTAGATCCCCACTTGCTTAGTAATAAACTAAAATTTAAACAAAATTCGCCAGAACTACTAGCAATTGCTTAAAACTCAAAGGACTTGCGGTGCCCACCCACCTAGAGGAGCCTGTTCTATAATCGATAAACCCCGATAAACCAGACCTATCTTGCCCAGCCTATATACCGCCATCGTCAGCTAACCTTTAAGAATTATAAGCAAAATCAAAACATAAAAACGTTAGGTCAAGGTGTAGCATATGATAAGG [...]
diff --git a/gsl/gauss.cpp b/gsl/gauss.cpp
index b6c2d0e..089c418 100644
--- a/gsl/gauss.cpp
+++ b/gsl/gauss.cpp
@@ -47,6 +47,18 @@
 //#include <gsl/gsl_cdf.h>
 #define GSL_DBL_EPSILON        2.2204460492503131e-16
 
+#ifndef M_2_SQRTPI
+#define M_2_SQRTPI  1.12837916709551257389615890312154517   /* 2/sqrt(pi)     */
+#endif
+
+#ifndef M_SQRT2
+#define M_SQRT2     1.41421356237309504880168872420969808   /* sqrt(2)        */
+#endif
+
+#ifndef M_SQRT1_2
+#define M_SQRT1_2   0.707106781186547524400844362104849039  /* 1/sqrt(2)      */
+#endif
+
 #ifndef M_1_SQRT2PI
 #define M_1_SQRT2PI (M_2_SQRTPI * M_SQRT1_2 / 2.0)
 #endif
diff --git a/hashsplitset.h b/hashsplitset.h
index f86fdd0..ea06192 100644
--- a/hashsplitset.h
+++ b/hashsplitset.h
@@ -122,7 +122,24 @@ public:
 	 * @param use_index TRUE to map to index of splits in sg, FALSE to map to split weights
 	*/
 	void buildMap(SplitGraph &sg, bool use_index = true);
-	
+
+	int getNumTree() {
+		return numTree;
+	}
+
+	void setNumTree(int maxValue) {
+		this->numTree = maxValue;
+	}
+
+private:
+
+	/**
+	 *  The maximum weight value. If the splits are generated from n trees and the splits of every tree
+	 *  all have weight = 1, then numTree = n.
+	 *  This variable is used to determine whether a split appears in all input trees.
+	 */
+	int numTree;
+
 };
 
 #endif
diff --git a/iqtree.cpp b/iqtree.cpp
index 0a9f34c..979bc4a 100644
--- a/iqtree.cpp
+++ b/iqtree.cpp
@@ -1,6 +1,8 @@
 /***************************************************************************
- *   Copyright (C) 2009 by BUI Quang Minh   *
- *   minh.bui at univie.ac.at   *
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -25,17 +27,18 @@
 #include "model/modelgtr.h"
 #include "model/rategamma.h"
 #include <numeric>
-#include "pll/pllInternal.h"
+#include "tools.h"
+#include "MPIHelper.h"
 #include "pllnni.h"
-#include "vectorclass/vectorclass.h"
-#include "vectorclass/vectormath_common.h"
 
+#ifdef _IQTREE_MPI
+#include <mpi.h>
+#endif
 
-Params *globalParam;
+Params *globalParams;
 Alignment *globalAlignment;
 extern StringIntMap pllTreeCounter;
 
-
 IQTree::IQTree() : PhyloTree() {
     IQTree::init();
 }
@@ -46,8 +49,6 @@ void IQTree::init() {
     k_delete = k_delete_min = k_delete_max = k_delete_stay = 0;
     dist_matrix = NULL;
     var_matrix = NULL;
-    nni_count_est = 0.0;
-    nni_delta_est = 0;
 //    curScore = 0.0; // Current score of the tree
     cur_pars_score = -1;
 //    enable_parsimony = false;
@@ -55,8 +56,8 @@ void IQTree::init() {
     nni_cutoff = -1e6;
     nni_sort = false;
     testNNI = false;
-    print_tree_lh = false;
-    write_intermediate_trees = 0;
+//    print_tree_lh = false;
+//    write_intermediate_trees = 0;
 //    max_candidate_trees = 0;
     logl_cutoff = 0.0;
     len_scale = 10000;
@@ -64,7 +65,22 @@ void IQTree::init() {
     duplication_counter = 0;
     //boot_splits = new SplitGraph;
     pll2iqtree_pattern_index = NULL;
-    fastNNI = true;
+
+    treels_name = Params::getInstance().out_prefix;
+    treels_name += ".treels";
+    out_lh_file = Params::getInstance().out_prefix;
+    out_lh_file += ".treelh";
+    site_lh_file = Params::getInstance().out_prefix;
+    site_lh_file += ".sitelh";
+
+    if (Params::getInstance().print_tree_lh) {
+        out_treelh.open(out_lh_file.c_str());
+        out_sitelh.open(site_lh_file.c_str());
+    }
+
+    if (Params::getInstance().write_intermediate_trees)
+        out_treels.open(treels_name.c_str());
+
 }
 
 IQTree::IQTree(Alignment *aln) : PhyloTree(aln) {
@@ -77,41 +93,49 @@ void IQTree::setCheckpoint(Checkpoint *checkpoint) {
     candidateTrees.setCheckpoint(checkpoint);
 }
 
-void IQTree::saveCheckpoint() {
-    stop_rule.saveCheckpoint();
-    candidateTrees.saveCheckpoint();
-    
-    if (boot_samples.size() > 0 && !boot_trees.front().empty()) {
-        checkpoint->startStruct("UFBoot");
-//        CKP_SAVE(max_candidate_trees);
-        CKP_SAVE(logl_cutoff);
-        // save boot_samples and boot_trees
-        int id = 0;
+void IQTree::saveUFBoot(Checkpoint *checkpoint) {
+    checkpoint->startStruct("UFBoot");
+    if (MPIHelper::getInstance().isWorker()) {
+        CKP_SAVE(sample_start);
+        CKP_SAVE(sample_end);
         checkpoint->startList(boot_samples.size());
+        checkpoint->setListElement(sample_start-1);
         // TODO: save boot_trees_brlen
-        for (vector<BootValType* >::iterator it = boot_samples.begin(); it != boot_samples.end(); it++, id++) {
+        for (int id = sample_start; id != sample_end; id++) {
             checkpoint->addListElement();
             stringstream ss;
             ss.precision(10);
             ss << boot_counts[id] << " " << boot_logl[id] << " " << boot_orig_logl[id] << " " << boot_trees[id];
             checkpoint->put("", ss.str());
-//            string &bt = boot_trees[id];
-//            CKP_SAVE(bt);
-//            double bl = boot_logl[id];
-//            CKP_SAVE(bl);
-//            double bol=boot_orig_logl[id];
-//            CKP_SAVE(bol);
-//            int bc = boot_counts[id];
-//            CKP_SAVE(bc);
         }
         checkpoint->endList();
+    } else {
+        CKP_SAVE(logl_cutoff);
         CKP_SAVE(boot_consense_logl);
         int boot_splits_size = boot_splits.size();
         CKP_SAVE(boot_splits_size);
-        checkpoint->endStruct();
+        checkpoint->startList(boot_samples.size());
+        // TODO: save boot_trees_brlen
+        for (int id = 0; id != boot_samples.size(); id++) {
+            checkpoint->addListElement();
+            stringstream ss;
+            ss.precision(10);
+            ss << boot_counts[id] << " " << boot_logl[id] << " " << boot_orig_logl[id] << " " << boot_trees[id];
+            checkpoint->put("", ss.str());
+        }
+        checkpoint->endList();
+    }
+    checkpoint->endStruct();
+}
 
+void IQTree::saveCheckpoint() {
+    stop_rule.saveCheckpoint();
+    candidateTrees.saveCheckpoint();
+    
+    if (boot_samples.size() > 0 && !boot_trees.front().empty()) {
+        saveUFBoot(checkpoint);
         // boot_splits
-        id = 0;
+        int id = 0;
         for (vector<SplitGraph*>::iterator sit = boot_splits.begin(); sit != boot_splits.end(); sit++, id++) {
             checkpoint->startStruct("UFBootSplit" + convertIntToString(id));
             (*sit)->saveCheckpoint();
@@ -122,6 +146,27 @@ void IQTree::saveCheckpoint() {
     PhyloTree::saveCheckpoint();
 }
 
+void IQTree::restoreUFBoot(Checkpoint *checkpoint) {
+    checkpoint->startStruct("UFBoot");
+    // save boot_samples and boot_trees
+    int id;
+    checkpoint->startList(params->gbo_replicates);
+    int sample_start, sample_end;
+    CKP_RESTORE(sample_start);
+    CKP_RESTORE(sample_end);
+    checkpoint->setListElement(sample_start-1);
+    for (id = sample_start; id != sample_end; id++) {
+        checkpoint->addListElement();
+        string str;
+        checkpoint->getString("", str);
+        assert(!str.empty());
+        stringstream ss(str);
+        ss >> boot_counts[id] >> boot_logl[id] >> boot_orig_logl[id] >> boot_trees[id];
+    }
+    checkpoint->endList();
+    checkpoint->endStruct();
+}
+
 void IQTree::restoreCheckpoint() {
     PhyloTree::restoreCheckpoint();
     stop_rule.restoreCheckpoint();
@@ -144,18 +189,6 @@ void IQTree::restoreCheckpoint() {
             checkpoint->getString("", str);
             stringstream ss(str);
             ss >> boot_counts[id] >> boot_logl[id] >> boot_orig_logl[id] >> boot_trees[id];
-//            string bt;
-//            CKP_RESTORE(bt);
-//            boot_trees[id] = bt;
-//            double bl;
-//            CKP_RESTORE(bl);
-//            boot_logl[id] = bl;
-//            double bol;
-//            CKP_RESTORE(bol);
-//            boot_orig_logl[id] = bol;
-//            int bc;
-//            CKP_RESTORE(bc);
-//            boot_counts[id] = bc;
         }
         checkpoint->endList();
         CKP_RESTORE(boot_consense_logl);
@@ -178,14 +211,16 @@ void IQTree::restoreCheckpoint() {
 }
 
 void IQTree::initSettings(Params &params) {
-    searchinfo.speednni = params.speednni;
+
     searchinfo.nni_type = params.nni_type;
     optimize_by_newton = params.optimize_by_newton;
-    setLikelihoodKernel(params.SSE);
-    candidateTrees.init(this->aln, &params);
-//    if (params.maxtime != 1000000) {
-//        params.autostop = false;
-//    }
+    if (num_threads > 0)
+        setLikelihoodKernel(params.SSE, num_threads);
+    else
+        setLikelihoodKernel(params.SSE, params.num_threads);
+    candidateTrees.init(this->aln, 200);
+    intermediateTrees.init(this->aln, 200000);
+
     if (params.min_iterations == -1) {
         if (!params.gbo_replicates) {
             if (params.stop_condition == SC_UNSUCCESS_ITERATION) {
@@ -253,12 +288,12 @@ void IQTree::initSettings(Params &params) {
     nni_sort = params.nni_sort;
     testNNI = params.testNNI;
 
-    globalParam = &params;
+	globalParams = &params;
     globalAlignment = aln;
 
-    write_intermediate_trees = params.write_intermediate_trees;
+    //write_intermediate_trees = params.write_intermediate_trees;
 
-    if (write_intermediate_trees > 2 || params.gbo_replicates > 0) {
+    if (Params::getInstance().write_intermediate_trees > 2 || params.gbo_replicates > 0) {
         save_all_trees = 1;
     }
     if (params.gbo_replicates > 0) {
@@ -268,7 +303,7 @@ void IQTree::initSettings(Params &params) {
     }
 //    if (params.gbo_replicates > 0 && params.do_compression)
 //        save_all_br_lens = true;
-    print_tree_lh = params.print_tree_lh;
+//    print_tree_lh = params.print_tree_lh;
 //    max_candidate_trees = params.max_candidate_trees;
 //    if (max_candidate_trees == 0)
 //        max_candidate_trees = aln->getNSeq() * params.step_iterations;
@@ -294,6 +329,20 @@ void IQTree::initSettings(Params &params) {
         cout << "Generating " << params.gbo_replicates << " samples for ultrafast bootstrap (seed: " << params.ran_seed << ")..." << endl;
         // allocate memory for boot_samples
         boot_samples.resize(params.gbo_replicates);
+        sample_start = 0;
+        sample_end = boot_samples.size();
+
+        // compute the sample_start and sample_end
+        if (MPIHelper::getInstance().getNumProcesses() > 1) {
+            int num_samples = boot_samples.size() / MPIHelper::getInstance().getNumProcesses();
+            if (boot_samples.size() % MPIHelper::getInstance().getNumProcesses() != 0)
+                num_samples++;
+            sample_start = MPIHelper::getInstance().getProcessID() * num_samples;
+            sample_end = sample_start + num_samples;
+            if (sample_end > boot_samples.size())
+                sample_end = boot_samples.size();
+        }
+
         size_t orig_nptn = getAlnNPattern();
 #ifdef BOOT_VAL_FLOAT
         size_t nptn = get_safe_upper_limit_float(orig_nptn);
@@ -488,17 +537,7 @@ void IQTree::computeInitialTree(string &dist_file, LikelihoodKernel kernel) {
     	params->numNNITrees = params->min_iterations;
     int fixed_number = 0;
     setParsimonyKernel(kernel);
-    
-    candidateTrees.init(aln, params);
-//    restoreCheckpoint();
-//    if (leafNum != 0) {
-//        if (!candidateTrees.empty()) {
-//            readTreeString(candidateTrees.getTopTrees(1)[0]);
-//            cout << endl << "CHECKPOINT: Current best tree restored, LogL: " << candidateTrees.getBestScore() << endl;
-//        } else
-//            cout << endl << "CHECKPOINT: Initial tree restored" << endl;
-//        return;
-//    } else 
+
     if (params->user_file) {
         // start the search with user-defined tree
         cout << "Reading input tree file " << params->user_file << " ..." << endl;
@@ -511,18 +550,17 @@ void IQTree::computeInitialTree(string &dist_file, LikelihoodKernel kernel) {
         	fixed_number = wrapperFixNegativeBranch(false);
         params->numInitTrees = 1;
         params->numNNITrees = 1;
-        // change to old kernel if tree is multifurcating
-//		if ((params->SSE == LK_EIGEN || params->SSE == LK_EIGEN_SSE) && !isBifurcating()) {
-//			cout << "NOTE: Changing to old kernel as input tree is multifurcating" << endl;
-//			params->SSE = LK_SSE;
-//		}
 		if (params->pll)
 			pllReadNewick(getTreeString());
     } else if (CKP_RESTORE(initTree)) {
         readTreeString(initTree);
         cout << endl << "CHECKPOINT: Initial tree restored" << endl;
     } else {
-        switch (params->start_tree) {
+        START_TREE_TYPE start_tree = params->start_tree;
+        // only own parsimony kernel supports constraint tree
+        if (!constraintTree.empty())
+            start_tree = STT_PARSIMONY;
+        switch (start_tree) {
         case STT_PARSIMONY:
             // Create parsimony tree using IQ-Tree kernel
             if (kernel == LK_EIGEN_SSE)
@@ -543,7 +581,7 @@ void IQTree::computeInitialTree(string &dist_file, LikelihoodKernel kernel) {
         case STT_PLL_PARSIMONY:
             cout << endl;
             cout << "Create initial parsimony tree by phylogenetic likelihood library (PLL)... ";
-            pllInst->randomNumberSeed = params->ran_seed;
+            pllInst->randomNumberSeed = params->ran_seed + MPIHelper::getInstance().getProcessID();
             pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInst, pllPartitions, params->sprDist);
             resetBranches(pllInst);
             pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions, pllInst->start->back,
@@ -569,6 +607,9 @@ void IQTree::computeInitialTree(string &dist_file, LikelihoodKernel kernel) {
         checkpoint->dump(true);
     }
 
+    if (!constraintTree.isCompatible(this))
+        outError("Initial tree is not compatible with constraint tree");
+
     if (fixed_number) {
         cout << "WARNING: " << fixed_number << " undefined/negative branch lengths are initialized with parsimony" << endl;
     }
@@ -587,7 +628,31 @@ void IQTree::computeInitialTree(string &dist_file, LikelihoodKernel kernel) {
     }
 }
 
-void IQTree::createInitTrees(int nParTrees) {
+int IQTree::addTreeToCandidateSet(string treeString, double score, bool updateStopRule, int sourceProcID) {
+    double curBestScore = candidateTrees.getBestScore();
+    int pos = candidateTrees.update(treeString, score);
+    if (updateStopRule) {
+        stop_rule.setCurIt(stop_rule.getCurIt() + 1);
+        if (score > curBestScore) {
+            if (pos != -1) {
+                stop_rule.addImprovedIteration(stop_rule.getCurIt());
+                cout << "BETTER TREE FOUND at iteration " << stop_rule.getCurIt() << ": " << score << endl;
+            } else {
+                cout << "UPDATE BEST LOG-LIKELIHOOD: " << score << endl;
+            }
+            bestcandidate_changed = true;
+            // COMMENT OUT: not safe with MPI version
+//            printResultTree();
+        }
+
+        curScore = score;
+        printIterationInfo(sourceProcID);
+    }
+    return pos;
+}
+
+void IQTree::initCandidateTreeSet(int nParTrees, int nNNITrees) {
+
     if (nParTrees > 0) {
         if (params->start_tree == STT_RANDOM_TREE)
             cout << "Generating " << nParTrees  << " random trees... ";
@@ -596,17 +661,22 @@ void IQTree::createInitTrees(int nParTrees) {
         cout.flush();
     }
     double startTime = getRealTime();
-    int numDupPars = 0;
+
 #ifdef _OPENMP
     StrVector pars_trees;
     if (params->start_tree == STT_PARSIMONY && nParTrees >= 1) {
         pars_trees.resize(nParTrees);
+        if (aln->ordered_pattern.empty())
+            aln->orderPatternByNumChars();
         #pragma omp parallel
         {
             PhyloTree tree;
+            if (params->constraint_tree_file) {
+                tree.constraintTree.initConstraint(params->constraint_tree_file, aln->getSeqNames());
+            }
             tree.setParams(params);
             tree.setParsimonyKernel(params->SSE);
-            #pragma omp for
+            #pragma omp for schedule(dynamic)
             for (int i = 0; i < nParTrees; i++) {
                 tree.computeParsimonyTree(NULL, aln);
                 pars_trees[i] = tree.getTreeString();
@@ -614,12 +684,18 @@ void IQTree::createInitTrees(int nParTrees) {
         }
     }
 #endif
+
+    int init_size = candidateTrees.size();
+
+    int processID = MPIHelper::getInstance().getProcessID();
+//    unsigned long curNumTrees = candidateTrees.size();
     for (int treeNr = 1; treeNr <= nParTrees; treeNr++) {
+        int parRandSeed = Params::getInstance().ran_seed + processID * nParTrees + treeNr;
         string curParsTree;
 
         /********* Create parsimony tree using PLL *********/
         if (params->start_tree == STT_PLL_PARSIMONY) {
-			pllInst->randomNumberSeed = params->ran_seed + treeNr * 12345;
+			pllInst->randomNumberSeed = parRandSeed;
 	        pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInst, pllPartitions, params->sprDist);
 	        resetBranches(pllInst);
 			pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions,
@@ -636,153 +712,163 @@ void IQTree::createInitTrees(int nParTrees) {
         } else if (params->start_tree == STT_PARSIMONY) {
             /********* Create parsimony tree using IQ-TREE *********/
 #ifdef _OPENMP
-            curParsTree = pars_trees[treeNr-1];
+            if (params->start_tree == STT_PARSIMONY)
+                curParsTree = pars_trees[treeNr-1];
+            else
+                curParsTree = generateParsimonyTree(parRandSeed);
 #else
-            computeParsimonyTree(NULL, aln);
-            curParsTree = getTreeString();
+            curParsTree = generateParsimonyTree(parRandSeed);
 #endif
-        } else {
-            assert(0);
         }
-
-        if (candidateTrees.treeExist(curParsTree)) {
-            numDupPars++;
-            continue;
-        } else {
-            if (params->count_trees) {
-                string tree = getTopology();
-                if (pllTreeCounter.find(tree) == pllTreeCounter.end()) {
-                    // not found in hash_map
-                    pllTreeCounter[curParsTree] = 1;
-                } else {
-                    // found in hash_map
-                    pllTreeCounter[curParsTree]++;
-                }
-        	}
-        	candidateTrees.update(curParsTree, -DBL_MAX, false);
+        
+        int pos = addTreeToCandidateSet(curParsTree, -DBL_MAX, false, MPIHelper::getInstance().getProcessID());
+        // if a duplicated tree is generated, then randomize the tree
+        if (pos == -1) {
+            readTreeString(curParsTree);
+            string randTree = doRandomNNIs();
+            addTreeToCandidateSet(randTree, -DBL_MAX, false, MPIHelper::getInstance().getProcessID());
         }
     }
-    
 
-    double parsTime = getRealTime() - startTime;
-    if (nParTrees > 0) {
-        cout << parsTime << " seconds ";
-        cout << candidateTrees.size() << " distinct starting trees" << endl;
-    }
+    if (nParTrees > 0)
+        cout << getRealTime() - startTime << " second" << endl;
 
     /****************************************************************************************
-                      Compute logl of all parsimony trees
+                          Compute logl of all initial trees
     *****************************************************************************************/
 
-    cout << "Computing log-likelihood of " << candidateTrees.size() << " initial trees ... ";
+    vector<string> initTreeStrings = candidateTrees.getBestTreeStrings();
+    candidateTrees.clear();
+
+    if (init_size < initTreeStrings.size())
+        cout << "Computing log-likelihood of " << initTreeStrings.size() - init_size << " initial trees ... ";
     startTime = getRealTime();
-//    CandidateSet candTrees = candidateTrees.getBestCandidateTrees(candidateTrees.size());
-    CandidateSet candTrees = candidateTrees;
 
-    for (CandidateSet::iterator it = candTrees.begin(); it != candTrees.end(); ++it) {
+    for (vector<string>::iterator it = initTreeStrings.begin(); it != initTreeStrings.end(); ++it) {
         string treeString;
         double score;
-        if (it->first == -DBL_MAX) {
-            readTreeString(it->second.tree);
+        readTreeString(*it);
+        if (it-initTreeStrings.begin() >= init_size)
             treeString = optimizeBranches(2);
-            score = getCurScore();
-        } else {
-            treeString = it->second.tree;
-            score = it->first;
+        else {
+            computeLogL();
+            treeString = getTreeString();
         }
-        candidateTrees.update(treeString, score);
+        score = getCurScore();
+        candidateTrees.update(treeString,score);
     }
-    
-    if (verbose_mode >= VB_MED) {
-        vector<double> bestScores = candidateTrees.getBestScores(candidateTrees.size());
-        for (vector<double>::iterator it = bestScores.begin(); it != bestScores.end(); it++)
-            cout << (*it) << " ";
-        cout << endl;
+
+    if (Params::getInstance().writeDistImdTrees)
+        intermediateTrees.initTrees(candidateTrees);
+
+    if (init_size < initTreeStrings.size())
+        cout << getRealTime() - startTime << " seconds" << endl;
+
+    if (nParTrees > 0) {
+        cout << "Current best score: " << candidateTrees.getBestScore() << endl;
     }
 
+/*
+    //---- NON-BLOCKING COMMUNICATION
+#ifdef _IQTREE_MPI
+    vector<string> trees;
+    vector<double> scores;
+    // FIX BUG: send candidateTrees instead of intermediateTrees
+    candidateTrees.getAllTrees(trees, scores, WT_TAXON_ID + WT_BR_LEN + WT_BR_LEN_SHORT);
+    // Send all trees to other processes
+    MPIHelper::getInstance().distributeTrees(trees, scores, TREE_TAG);
+
+    // Get trees from other nodes
+    cout << "Getting initial trees from other processes ... " << endl;
+    int maxNumTrees = (nParTrees + 2) * (MPIHelper::getInstance().getNumProcesses() - 1);
+    MPI_CollectTrees(true, maxNumTrees, false);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+*/
 
-    double loglTime = getRealTime() - startTime;
-    cout << loglTime << " seconds" << endl;
-}
+    //---- BLOCKING COMMUNICATION
+    syncCandidateTrees(nNNITrees, false);
 
-void IQTree::initCandidateTreeSet(int nParTrees, int nNNITrees) {
 
-    bool finishedInitTree = checkpoint->getBool("finishedInitTree");
+    vector<string> bestInitTrees; // Set of best initial trees for doing NNIs
 
-    if (finishedInitTree) {
-        cout << "CHECKPOINT: " << min(nParTrees, (int)candidateTrees.size()) << " initial trees restored" << endl;
-    } else {
-        createInitTrees(nParTrees);
-        checkpoint->putBool("finishedInitTree", true);
-        saveCheckpoint();
-        checkpoint->dump();
-    }
+    bestInitTrees = candidateTrees.getBestTreeStringsForProcess(nNNITrees);
 
-    // Only select the best nNNITrees for doing NNI search
-    CandidateSet initParsimonyTrees = candidateTrees.getBestCandidateTrees(nNNITrees);
+    cout << endl;
+    cout << "Do NNI search on " << bestInitTrees.size() << " best initial trees" << endl;
+    stop_rule.setCurIt(0);
     candidateTrees.clear();
+    candidateTrees.setMaxSize(Params::getInstance().numSupportTrees);
 
-    cout << "Optimizing top " << initParsimonyTrees.size() << " initial trees with NNI..." << endl;
-    double startTime = getCPUTime();
-    /*********** START: Do NNI on the best parsimony trees ************************************/
-    CandidateSet::reverse_iterator rit = initParsimonyTrees.rbegin();
-
-//    stop_rule.setCurIt(0);
-    if (stop_rule.getCurIt() > 0) {
-        int step = stop_rule.getCurIt();
-        for (; rit != initParsimonyTrees.rend() && step > 0; ++rit, step--) {
-            // increase iterator accordingly
-            candidateTrees.update(rit->second.tree, rit->first);
-        }
-        cout << "CHECKPOINT: " << stop_rule.getCurIt() << " initial iterations restored" << endl;
+    for (vector<string>::iterator it = bestInitTrees.begin(); it != bestInitTrees.end(); it++) {
+        readTreeString(*it);
+        doNNISearch();
+        string treeString = getTreeString();
+        addTreeToCandidateSet(treeString, curScore, true, MPIHelper::getInstance().getProcessID());
+        if (Params::getInstance().writeDistImdTrees)
+            intermediateTrees.update(treeString, curScore);
+//#ifdef _IQTREE_MPI
+//        MPIHelper::getInstance().distributeTree(getTreeString(), curScore, TREE_TAG);
+//        MPI_CollectTrees(false, maxNumTrees, true);
+//#endif
     }
-    for (; rit != initParsimonyTrees.rend(); ++rit) {
-        stop_rule.setCurIt(stop_rule.getCurIt() + 1);
-    	int nniCount, nniStep;
-        double initLogl, nniLogl;
-        string tree;
-        readTreeString(rit->second.tree);
-        computeLogL();
-//         THIS HAPPEN WHENEVER USING FULL PARTITION MODEL
-//        if (isSuperTree() && params->partition_type == 0) {
-//        	if (verbose_mode >= VB_MED)
-//        		cout << "curScore: " << getCurScore() << " expected score: " << rit->first << endl;
-//        	optimizeBranches(2);
-//        }
-        initLogl = getCurScore();
-        tree = doNNISearch(nniCount, nniStep);
-        nniLogl = getCurScore();
-        cout << "Iteration " << stop_rule.getCurIt() << " / LogL: " << getCurScore();
-        if (verbose_mode >= VB_MED) {
-        	cout << " / NNI count, steps: " << nniCount << "," << nniStep;
-        	cout << " / Parsimony logl " << initLogl << " / NNI logl: " << nniLogl;
-        }
-        cout << " / Time: " << convert_time(getRealTime() - params->start_real_time) << endl;
-
-        bool betterScore = false;
-        // Better tree or score is found
-        if (getCurScore() > candidateTrees.getBestScore() + params->modeps) {
-            // Re-optimize model parameters (the sNNI algorithm)
-        	tree = optimizeModelParameters(false, params->modeps * 10);
-            getModelFactory()->saveCheckpoint();
-        	betterScore = true;
-        }
-        bool newTree = candidateTrees.update(tree, getCurScore());
-		if (betterScore) {
-			if (newTree && nniCount != 0)
-				cout << "BETTER TREE FOUND at iteration " << stop_rule.getCurIt() << ": "
-						<< getCurScore() << endl;
-			else
-				cout << "BETTER SCORE FOUND at iteration " << stop_rule.getCurIt() << ": "
-						<< getCurScore() << endl;
-		}
-        saveCheckpoint();
-        checkpoint->dump();
-//        if (params.partition_type)
-//        	((PhyloSuperTreePlen*)&iqtree)->printNNIcasesNUM();
+
+    //---- BLOCKING COMMUNICATION
+    syncCandidateTrees(Params::getInstance().numSupportTrees, true);
+
+/*
+#ifdef _IQTREE_MPI
+    //------ NON-BLOCKING COMMUNICATION
+    // FIX BUG: send candidateTrees instead of intermediateTrees
+    candidateTrees.getAllTrees(trees, scores, WT_TAXON_ID + WT_BR_LEN + WT_BR_LEN_SHORT);
+    // Send all trees to other processes
+    MPIHelper::getInstance().distributeTrees(trees, scores, TREE_TAG);
+
+    // Get trees from other nodes
+    cout << "Getting top candidate trees from other processes ... " << endl;
+    MPI_CollectTrees(true, maxNumTrees, true);
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+*/
+
+// #ifdef _IQTREE_MPI
+//     // Send trees
+//     MPIHelper::getInstance().distributeTrees(nniTrees, nniScores, TREE_TAG);
+//     MPI_Barrier(MPI_COMM_WORLD);
+//     // Receive trees
+//     maxNumTrees = treesPerProc * (MPIHelper::getInstance().getNumProcesses() - 1);
+//     MPI_CollectTrees(true,maxNumTrees,true);
+// #endif
+//    if (params->fixStableSplits && candidateTrees.size() > 1) {
+//        candidateTrees.computeSplitOccurences(Params::getInstance().stableSplitThreshold, Params::getInstance().numSupportTrees);
+//    }
+}
+
+string IQTree::generateParsimonyTree(int randomSeed) {
+    string parsimonyTreeString;
+    if (params->start_tree == STT_PLL_PARSIMONY) {
+        pllInst->randomNumberSeed = randomSeed;
+        pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInst,
+                                                          pllPartitions, params->sprDist);
+        resetBranches(pllInst);
+        pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions,
+                        pllInst->start->back, PLL_FALSE, PLL_TRUE, PLL_FALSE,
+                        PLL_FALSE, PLL_FALSE, PLL_SUMMARIZE_LH, PLL_FALSE, PLL_FALSE);
+        parsimonyTreeString = string(pllInst->tree_string);
+        PhyloTree::readTreeString(parsimonyTreeString);
+        wrapperFixNegativeBranch(true);
+        parsimonyTreeString = getTreeString();
+    } else if (params->start_tree == STT_RANDOM_TREE) {
+        generateRandomTree(YULE_HARDING);
+        wrapperFixNegativeBranch(true);
+        parsimonyTreeString = getTreeString();
+    } else {
+        computeParsimonyTree(NULL, aln);
+        parsimonyTreeString = getTreeString();
     }
-    double nniTime = getCPUTime() - startTime;
-    cout << "Average CPU time for 1 NNI search: " << nniTime / initParsimonyTrees.size() << endl;
+    return parsimonyTreeString;
+
 }
 
 void IQTree::initializePLL(Params &params) {
@@ -791,11 +877,11 @@ void IQTree::initializePLL(Params &params) {
     pllAttr.saveMemory = PLL_FALSE;
     pllAttr.useRecom = PLL_FALSE;
     pllAttr.randomNumberSeed = params.ran_seed;
-    pllAttr.numberOfThreads = params.num_threads; /* This only affects the pthreads version */
+    pllAttr.numberOfThreads = max(params.num_threads, 1); /* This only affects the pthreads version */
     if (pllInst != NULL) {
         pllDestroyInstance(pllInst);
     }
-    /* Create a PLL instance */
+    /* Create a PLL getInstance */
     pllInst = pllCreateInstance(&pllAttr);
 
     /* Read in the alignment file */
@@ -1291,68 +1377,249 @@ void IQTree::doParsimonyReinsertion() {
     fixNegativeBranch(false);
 }
 
-
-int IQTree::removeBranches(NodeVector& nodes1, NodeVector& nodes2, SplitGraph& splits) {
-	if (splits.size() == 0)
-		return 0;
-	NodeVector _nodes1, _nodes2;
-	NodeVector::iterator it1, it2;
-	_nodes1 = nodes1;
-	_nodes2 = nodes2;
-	nodes1.clear();
-	nodes2.clear();
-	for (it1 = _nodes1.begin(), it2 = _nodes2.begin(); it1 != _nodes1.end() && it2 != _nodes2.end(); it1++, it2++) {
-		Split* sp = getSplit(*it1, *it2);
-		if (!splits.containSplit(*sp)) {
-			nodes1.push_back(*it1);
-			nodes2.push_back(*it2);
+void IQTree::getNonTabuBranches(Branches& allBranches, SplitGraph& tabuSplits, Branches& nonTabuBranches, Branches* tabuBranches) {
+    if (tabuSplits.size() == 0) {
+    	return;
+    }
+	for (Branches::iterator it = allBranches.begin(); it != allBranches.end(); it++) {
+		if (isInnerBranch(it->second.first, it->second.second)) {
+            int nodeID1 = it->second.first->id;
+            int nodeID2 = it->second.second->id;
+            Branch curBranch = it->second;
+            Split* sp = getSplit(it->second.first, it->second.second);
+			if (!tabuSplits.containSplit(*sp)) {
+                nonTabuBranches.insert(pair<int,Branch>(pairInteger(nodeID1, nodeID2), curBranch));
+			} else {
+				if (tabuBranches != NULL) {
+					tabuBranches->insert(pair<int,Branch>(pairInteger(nodeID1, nodeID2), curBranch));
+				}
+			}
+			delete sp;
 		}
-		delete sp;
+
 	}
-	return (_nodes1.size() - nodes1.size());
 }
 
-void IQTree::doRandomNNIs(int numNNI) {
-	NodeVector nodes1, nodes2;
-	//SplitGraph usedSplits;
-	NodeVector::iterator it1, it2;
-    int cntNNI = 0;
-    while (cntNNI < numNNI) {
-    	nodes1.clear();
-    	nodes2.clear();
-		getAllInnerBranches(nodes1, nodes2, &candidateTrees.getStableSplits());
-    	// remove all used splits
-		//removeBranches(nodes1, nodes2, usedSplits);
-		if (nodes1.size() == 0) {
-			assert(nodes2.size() == 0);
-			break;
-		}
-    	// randomly take an inner branch and do a random NNI
-        int index = random_int(nodes1.size());
-        doOneRandomNNI(nodes1[index], nodes2[index]);
-//        if (params->fix_stable_splits) {
-//            Split* newSp = getSplit(nodes1[index], nodes2[index]);
-//            usedSplits.push_back(newSp);
-//        }
-    	cntNNI++;
+void IQTree::getSplitBranches(Branches &branches, SplitIntMap &splits, Node *node, Node *dad) {
+    if (!node) {
+        node = root;
+    }
+    FOR_NEIGHBOR_IT(node, dad, it) {
+            if (isInnerBranch((*it)->node, node)) {
+                Branch curBranch;
+                curBranch.first = (*it)->node;
+                curBranch.second = node;
+                Split* curSplit;
+                Split *sp = (*it)->split;
+                assert(sp != NULL);
+                curSplit = new Split(*sp);
+                if (curSplit->shouldInvert())
+                    curSplit->invert();
+                if (splits.findSplit(curSplit) != NULL) {
+                    //curSplit->report(cout);
+                    branches.insert(pair<int,Branch>(pairInteger(curBranch.first->id, curBranch.second->id), curBranch));
+                }
+                delete curSplit;
+            }
+            getSplitBranches(branches, splits, (*it)->node, node);
+        }
+}
+
+bool IQTree::shouldEvaluate(Split *curSplit, SplitIntMap &tabuSplits, SplitIntMap &candSplits) {
+    bool answer = true;
+    /******************** CHECK TABU SPLIT **************************/
+    if (tabuSplits.findSplit(curSplit) != NULL) {
+        answer = false;
+    } else if (!candSplits.empty()) {
+        Split *_curSplit;
+        /******************** CHECK STABLE SPLIT **************************/
+        int value;
+        _curSplit = candSplits.findSplit(curSplit, value);
+        if (_curSplit == NULL || _curSplit->getWeight() <= params->stableSplitThreshold) {
+            answer = true;
+        } else { // add a stable branch with a certain probability
+            double rndDbl = random_double();
+            if (rndDbl > params->stableSplitThreshold) {
+                answer = true;
+            } else {
+                answer = false;
+            }
+        }
+    } else {
+        answer = true;
+    }
+    return answer;
+}
+
+
+void IQTree::getNNIBranches(SplitIntMap &tabuSplits, SplitIntMap &candSplits,Branches &nonNNIBranches, Branches &nniBranches, Node *node, Node *dad) {
+    if (!node) {
+        node = root;
+    }
+    FOR_NEIGHBOR_IT(node, dad, it) {
+            if (isInnerBranch((*it)->node, node)) {
+                Branch curBranch;
+                curBranch.first = (*it)->node;
+                curBranch.second = node;
+                int branchID = pairInteger(curBranch.first->id, curBranch.second->id);
+
+                if (params->fixStableSplits) {
+                    Split *curSplit;
+                    Split *sp = (*it)->split;
+                    assert(sp != NULL);
+                    curSplit = new Split(*sp);
+                    if (curSplit->shouldInvert())
+                        curSplit->invert();
+                    if (shouldEvaluate(curSplit, tabuSplits, candSplits)) {
+                        nniBranches.insert(pair<int, Branch>(branchID, curBranch));
+                    } else {
+                        nonNNIBranches.insert(pair<int, Branch>(branchID, curBranch));
+                    }
+                    delete curSplit;
+                } else {
+                    nniBranches.insert(pair<int, Branch>(branchID, curBranch));
+                }
+            }
+            getNNIBranches(tabuSplits, candSplits, nonNNIBranches, nniBranches, (*it)->node, node);
+        }
+}
+
+/**
+ * Collect into stableBranches every inner branch below `node` whose (normalized) split is found in
+ * candSplits with weight >= supportValue. Entries are only added, never removed, by this function.
+ */
+void IQTree::getStableBranches(SplitIntMap &candSplits, double supportValue, Branches &stableBranches, Node *node, Node *dad) {
+    if (node == NULL)
+        node = root; // a NULL node starts the traversal at the root
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        Node *child = (*it)->node;
+        if (isInnerBranch(child, node)) {
+            Split *treeSplit = (*it)->split;
+            assert(treeSplit != NULL);
+            // look the split up through a normalized copy; the split stored on the neighbor is not modified
+            Split *query = new Split(*treeSplit);
+            if (query->shouldInvert())
+                query->invert();
+            int occurences; // passed to findSplit(); its value is not used here
+            Split *match = candSplits.findSplit(query, occurences);
+            if (match != NULL && match->getWeight() >= supportValue) {
+                Branch branch;
+                branch.first = child;
+                branch.second = node;
+                stableBranches.insert(pair<int, Branch>(pairInteger(child->id, node->id), branch));
+            }
+            delete query;
+        }
+        getStableBranches(candSplits, supportValue, stableBranches, child, node);
+    }
+}
+
+/**
+ * Perturb the current tree by applying random NNIs on "stable" branches, i.e. inner branches whose
+ * split is found in the candidate splits with weight >= suppValue, until no such branch is left.
+ * @param suppValue minimum split weight for a branch to count as stable
+ * @return getTreeString() of the perturbed tree
+ */
+string IQTree::perturbStableSplits(double suppValue) {
+    int numRandNNI = 0;
+    Branches stableBranches;
+    do {
+        // getStableBranches() only inserts: without this reset, branches found in an earlier pass
+        // would stay in the map and the loop condition below could never become false
+        stableBranches.clear();
+        getStableBranches(candidateTrees.getCandSplits(), suppValue, stableBranches);
+        vector<NNIMove> randomNNIs;
+        vector<NNIMove> compatibleNNIs;
+        for (map<int, Branch>::iterator it = stableBranches.begin(); it != stableBranches.end(); it++) {
+            NNIMove randNNI = getRandomNNI(it->second);
+            if (constraintTree.isCompatible(randNNI)) // keep only moves accepted by the constraint tree
+                randomNNIs.push_back(randNNI);
+        }
+        getCompatibleNNIs(randomNNIs, compatibleNNIs);
+        for (vector<NNIMove>::iterator it = compatibleNNIs.begin(); it != compatibleNNIs.end(); it++) {
+            doNNI(*it);
+            numRandNNI++;
+        }
+    } while (stableBranches.size() > 0);
+
+    if (verbose_mode >= VB_MAX) {
+        cout << "Tree perturbation: number of random NNI performed = " << numRandNNI << endl;
     }
-	//cout << "Number of random NNI performed: " << cntNNI << endl;
     setAlignment(aln);
     setRootNode(params->root);
 
+    clearAllPartialLH();
+
     if (isSuperTree()) {
         ((PhyloSuperTree*) this)->mapTrees();
     }
+    if (params->pll) {
+        pllReadNewick(getTreeString());
+    }
+
+    resetCurScore();
+    return getTreeString();
+}
+
+/**
+ * Perturb the current tree with random NNIs and return getTreeString() of the result.
+ * @param storeTabu true: attempt (leafNum - 3 - #stable branches) NNIs and record the split of each
+ *        applied NNI in initTabuSplits; false: attempt floor((leafNum - 3) * initPS) NNIs
+ */
+string IQTree::doRandomNNIs(bool storeTabu) {
+    int numRandomNNI;
+    if (!storeTabu) {
+        numRandomNNI = floor((leafNum - 3) * Params::getInstance().initPS);
+    } else {
+        Branches stableBranches;
+        getStableBranches(candidateTrees.getCandSplits(), Params::getInstance().stableSplitThreshold, stableBranches);
+        numRandomNNI = leafNum - 3 - stableBranches.size();
+    }
+    initTabuSplits.clear();
+    Branches nniBranches;
+    Branches nonNNIBranches;
+    int cntNNI = 0;
+    // cntNNI counts attempts: it also advances when the drawn NNI violates the constraint tree
+    for (; cntNNI < numRandomNNI; cntNNI++) {
+        nniBranches.clear();
+        nonNNIBranches.clear();
+        getNNIBranches(initTabuSplits, candidateTrees.getCandSplits(), nonNNIBranches, nniBranches);
+        if (nniBranches.empty())
+            break;
+        // draw one candidate branch by its position in the map's iteration order
+        int randInt = random_int((int) nniBranches.size());
+        Branches::iterator chosen = nniBranches.begin();
+        for (int step = 0; step < randInt; ++step)
+            ++chosen;
+        NNIMove randNNI = getRandomNNI(chosen->second);
+        if (!constraintTree.isCompatible(randNNI))
+            continue;
+        doNNI(randNNI);
+        if (storeTabu) {
+            Split *tabuSplit = new Split(*getSplit(randNNI.node1, randNNI.node2));
+            if (tabuSplit->shouldInvert())
+                tabuSplit->invert();
+            initTabuSplits.insertSplit(tabuSplit, 1);
+        }
+    }
+    if (verbose_mode >= VB_MAX)
+	    cout << "Tree perturbation: number of random NNI performed = " << cntNNI << endl;
+    setAlignment(aln);
+    setRootNode(params->root);
 
+    if (isSuperTree()) {
+        ((PhyloSuperTree*) this)->mapTrees();
+    }
     if (params->pll) {
     	pllReadNewick(getTreeString());
     }
 
+    clearAllPartialLH();
     resetCurScore();
+    return getTreeString();
 }
 
 
-
 void IQTree::doIQP() {
     if (verbose_mode >= VB_DEBUG)
         drawTree(cout, WT_BR_SCALE | WT_INT_NODE | WT_TAXON_ID | WT_NEWLINE | WT_BR_ID);
@@ -1732,9 +1999,7 @@ extern pllUFBootData * pllUFBootDataPtr;
 
 string IQTree::optimizeModelParameters(bool printInfo, double logl_epsilon) {
 	if (logl_epsilon == -1)
-		logl_epsilon = params->modeps;
-//    if (params->opt_gammai)
-//        logl_epsilon = 0.1;
+		logl_epsilon = params->modelEps;
     cout << "Estimate model parameters (epsilon = " << logl_epsilon << ")" << endl;
 	double stime = getRealTime();
 	string newTree;
@@ -1791,14 +2056,14 @@ string IQTree::optimizeModelParameters(bool printInfo, double logl_epsilon) {
 	return newTree;
 }
 
-void IQTree::printBestScores(int numBestScore) {
+void IQTree::printBestScores() { // print the scores from candidateTrees.getBestScores(params->popSize), space-separated, on one line
 	vector<double> bestScores = candidateTrees.getBestScores(params->popSize);
 	for (vector<double>::iterator it = bestScores.begin(); it != bestScores.end(); it++)
 		cout << (*it) << " ";
 	cout << endl;
 }
 
-void IQTree::computeLogL() {
+double IQTree::computeLogL() {
 	if (params->pll) {
 		if (curScore == -DBL_MAX) {
 			pllEvaluateLikelihood(pllInst, pllPartitions, pllInst->start, PLL_TRUE, PLL_FALSE);
@@ -1813,7 +2078,7 @@ void IQTree::computeLogL() {
 //		}
 		curScore = computeLikelihood();
 	}
-//	lhComputed = true;
+	return curScore;
 }
 
 string IQTree::optimizeBranches(int maxTraversal) {
@@ -1834,83 +2099,167 @@ string IQTree::optimizeBranches(int maxTraversal) {
 //            clearAllPartialLH();
 //            lhComputed = true;
 //    	}
-    	curScore = optimizeAllBranches(maxTraversal);
+        curScore = optimizeAllBranches(maxTraversal, params->loglh_epsilon, PLL_NEWZPERCYCLE);
         tree = getTreeString();
     }
     return tree;
 }
 
+/** Exchange UFBoot trees over MPI: for each index the master keeps the higher-scoring tree among its own and the workers'. */
+void IQTree::collectBootTrees() {
+#ifdef _IQTREE_MPI
+    if (boot_trees.empty())
+        return;
+    if (MPIHelper::getInstance().isMaster()) {
+        MPIHelper::getInstance().sendMsg(BOOT_TAG, "BOOT TREES PLEASE!");
+        TreeCollection trees;
+        int count = 0;
+        // pre-checked loop: with a single process there is no worker to wait for
+        while (count < MPIHelper::getInstance().getNumProcesses()-1) {
+            int source = MPIHelper::getInstance().receiveTrees(trees, BOOT_TREE_TAG);
+            if (source <= 0) continue; // no trees from a worker this time, try again
+            count++;
+            assert(trees.getNumTrees() == boot_trees.size());
+            int better_trees = 0;
+            for (int id = 0; id < trees.getNumTrees(); id++)
+                if (trees.getScores()[id] > boot_logl[id]) {
+                    boot_trees[id] = trees.getTreeStrings()[id];
+                    boot_logl[id] = trees.getScores()[id];
+                    better_trees++;
+                }
+            trees.clear();
+            cout << better_trees << " better bootstrap trees from process " << source << endl;
+        }
+    } else {
+        // worker: answer a pending request for boot trees, then pick up a logl cutoff if one was sent
+        if (MPIHelper::getInstance().checkMsg(BOOT_TAG))
+            MPIHelper::getInstance().sendTrees(PROC_MASTER, boot_trees, boot_logl, BOOT_TREE_TAG);
+        string msg;
+        if (MPIHelper::getInstance().checkMsg(LOGL_CUTOFF_TAG, msg)) {
+            logl_cutoff = convert_double(msg.c_str());
+            cout << "Log-likelihood cutoff on original alignment: " << logl_cutoff << endl;
+        }
+    }
+#endif
+}
+
 double IQTree::doTreeSearch() {
     cout << "--------------------------------------------------------------------" << endl;
-    cout << "|               OPTIMIZING CANDIDATE TREE SET                      |" << endl;
+    cout << "|             INITIALIZING CANDIDATE TREE SET                      |" << endl;
     cout << "--------------------------------------------------------------------" << endl;
-    // PLEASE PRINT TREE HERE!
-    printResultTree();
-    string treels_name = params->out_prefix;
-    treels_name += ".treels";
-    string out_lh_file = params->out_prefix;
-    out_lh_file += ".treelh";
-    string site_lh_file = params->out_prefix;
-    site_lh_file += ".sitelh";
 
-    if (params->print_tree_lh) {
-        out_treelh.open(out_lh_file.c_str());
-        out_sitelh.open(site_lh_file.c_str());
+    double initCPUTime = getRealTime();
+    int treesPerProc = (params->numInitTrees) / MPIHelper::getInstance().getNumProcesses() - candidateTrees.size();
+    if (params->numInitTrees % MPIHelper::getInstance().getNumProcesses() != 0) {
+        treesPerProc++;
     }
+    if (treesPerProc < 0)
+        treesPerProc = 0;
+    // Master node does one tree less because it already created the BIONJ tree
+//    if (MPIHelper::getInstance().isMaster()) {
+//        treesPerProc--;
+//    }
 
-    if (params->write_intermediate_trees)
-        out_treels.open(treels_name.c_str());
+    // Make sure to get at least 1 tree
+    if (treesPerProc < 1 && params->numInitTrees > candidateTrees.size())
+        treesPerProc = 1;
 
-    if (params->write_intermediate_trees && save_all_trees != 2) {
-        printIntermediateTree(WT_NEWLINE | WT_APPEND | WT_SORT_TAXA | WT_BR_LEN);
-    }
+    /* Initialize candidate tree set */
+    if (!getCheckpoint()->getBool("finishedCandidateSet")) {
+        initCandidateTreeSet(treesPerProc, params->numNNITrees);
+        // write best tree to disk
+        printBestCandidateTree();
+        saveCheckpoint();
+        getCheckpoint()->putBool("finishedCandidateSet", true);
+        getCheckpoint()->dump(true);
+    } else {
+        cout << "CHECKPOINT: Candidate tree set restored, best LogL: " << candidateTrees.getBestScore() << endl;
+    }
+    assert(candidateTrees.size() != 0);
+    cout << "Finish initializing candidate tree set (" << candidateTrees.size() << ")" << endl;
+
+
+    cout << "Current best tree score: " << candidateTrees.getBestScore() << " / CPU time: " <<
+    getRealTime() - initCPUTime << endl;
+    cout << "Number of iterations: " << stop_rule.getCurIt() << endl;
+
+//    string treels_name = params->out_prefix;
+//    treels_name += ".treels";
+//    string out_lh_file = params->out_prefix;
+//    out_lh_file += ".treelh";
+//    string site_lh_file = params->out_prefix;
+//    site_lh_file += ".sitelh";
+//
+//    if (params->print_tree_lh) {
+//        out_treelh.open(out_lh_file.c_str());
+//        out_sitelh.open(site_lh_file.c_str());
+//    }
+
+//    if (params->write_intermediate_trees)
+//        out_treels.open(treels_name.c_str());
+
+//    if (params->write_intermediate_trees && save_all_trees != 2) {
+//        printIntermediateTree(WT_NEWLINE | WT_APPEND | WT_SORT_TAXA | WT_BR_LEN);
+//    }
 
     setRootNode(params->root);
-    // keep the best tree into a string
-    //stringstream bestTreeStream;
-    //stringstream bestTopoStream;
-//    string perturb_tree_string;
-    string imd_tree;
-    //printTree(bestTreeStream, WT_TAXON_ID + WT_BR_LEN);
-    //printTree(bestTopoStream, WT_TAXON_ID + WT_SORT_TAXA);
-    //string best_tree_topo = bestTopoStream.str();
-
-    // if not zero, it means already recovered from checkpoint
-    if (stop_rule.getLastImprovedIteration() == 0)
-    	stop_rule.addImprovedIteration(1);
-    else
-    	cout << "CHECKPOINT: " <<  stop_rule.getCurIt() << " search iterations restored" << endl;
+
+    if (!getCheckpoint()->getBool("finishedCandidateSet"))
+        cout << "CHECKPOINT: " << stop_rule.getCurIt() << " search iterations restored" << endl;
+
     searchinfo.curPerStrength = params->initPS;
+    double cur_correlation = 0.0;
 
-	double cur_correlation = 0.0;
 
-	/*====================================================
-	 * MAIN LOOP OF THE IQ-TREE ALGORITHM
-	 *====================================================*/
-    while(!stop_rule.meetStopCondition(stop_rule.getCurIt(), cur_correlation)) {
-        stop_rule.setCurIt(stop_rule.getCurIt() + 1);
+    if ((Params::getInstance().fixStableSplits || Params::getInstance().adaptPertubation) && candidateTrees.size() > 1) {
+        candidateTrees.computeSplitOccurences(Params::getInstance().stableSplitThreshold);
+    }
+
+    // per-process flags: has a worker's candidate set changed relative to the master's candidate set?
+    candidateset_changed.resize(MPIHelper::getInstance().getNumProcesses(), false);
+    bestcandidate_changed = false;
+
+    /*==============================================================================================================
+	                                       MAIN LOOP OF THE IQ-TREE ALGORITHM
+	 *=============================================================================================================*/
+
+    bool optimization_looped = false;
+    if (!stop_rule.meetStopCondition(stop_rule.getCurIt(), cur_correlation)) {
+        cout << "--------------------------------------------------------------------" << endl;
+        cout << "|               OPTIMIZING CANDIDATE TREE SET                      |" << endl;
+        cout << "--------------------------------------------------------------------" << endl;
+        optimization_looped = true;
+    }
+
+    // count threshold for computing bootstrap correlation
+    int ufboot_count, ufboot_count_check;
+    stop_rule.getUFBootCountCheck(ufboot_count, ufboot_count_check);
+
+    while (!stop_rule.meetStopCondition(stop_rule.getCurIt(), cur_correlation)) {
+
+/*
+#ifdef _IQTREE_MPI
+        // check stopping rule
+        if (MPIHelper::getInstance().isMaster()) { 
+            if (stop_rule.meetStopCondition(stop_rule.getCurIt(), cur_correlation)) {
+                MPIHelper::getInstance().sendMsg(STOP_TAG, "STOP!");
+                break;
+            }
+        } else {
+            if(MPIHelper::getInstance().checkMsg(STOP_TAG)) {
+                break;
+            }
+        }     
+#else         
+        if (stop_rule.meetStopCondition(stop_rule.getCurIt(), cur_correlation))
+            break;
+#endif
+*/
         searchinfo.curIter = stop_rule.getCurIt();
         // estimate logl_cutoff for bootstrap
         if (!boot_orig_logl.empty())
             logl_cutoff = *min_element(boot_orig_logl.begin(), boot_orig_logl.end());
 
-//        if (/*params->avoid_duplicated_trees && max_candidate_trees > 0 &&*/ stop_rule.getCurIt() > 2 /* && treels_logl.size() > 1000*/) {
-//        	int predicted_iteration = ((stop_rule.getCurIt()+params->step_iterations-1)/params->step_iterations)*params->step_iterations;
-//            int num_entries = floor(max_candidate_trees * ((double) stop_rule.getCurIt() / predicted_iteration));
-//            if (num_entries < treels_logl.size() * 0.9) {
-//                DoubleVector logl = treels_logl;
-//                nth_element(logl.begin(), logl.begin() + (treels_logl.size() - num_entries), logl.end());
-//                logl_cutoff = logl[treels_logl.size() - num_entries] - 1.0;
-//            } else
-//                logl_cutoff = 0.0;
-//            if (verbose_mode >= VB_MED) {
-//                if (stop_rule.getCurIt() % 10 == 0) {
-//                    cout << treels_logl.size() << " logls, logl_cutoff= " << logl_cutoff;
-//                        cout << endl;
-//                }
-//            }
-//        }
-
         if (estimate_nni_cutoff && nni_info.size() >= 500) {
             estimate_nni_cutoff = false;
             estimateNNICutoff(params);
@@ -1918,73 +2267,51 @@ double IQTree::doTreeSearch() {
 
         Alignment *saved_aln = aln;
 
-    	/*----------------------------------------
-    	 * Perturb the tree
-    	 *---------------------------------------*/
-        double perturbScore = 0.0;
-        int numStableBranches = aln->getNSeq() - 3 - candidateTrees.getStableSplits().size();
-        // Change from floor to ceil to make sure perturbing at least 1 branch
-        int numPerturb = ceil(searchinfo.curPerStrength * numStableBranches);
-        bool treechanged = false;
-        if (iqp_assess_quartet == IQP_BOOTSTRAP) {
-            // create bootstrap sample
-            Alignment* bootstrap_alignment;
-            if (aln->isSuperAlignment())
-                bootstrap_alignment = new SuperAlignment;
-            else
-                bootstrap_alignment = new Alignment;
-            bootstrap_alignment->createBootstrapAlignment(aln, NULL, params->bootstrap_spec);
-            setAlignment(bootstrap_alignment);
-            initializeAllPartialLh();
-            clearAllPartialLH();
-            curScore = optimizeAllBranches();
-        } else {
-            if (params->snni) {
-//                string candidateTree = candidateTrees.getRandCandTree();
-//                readTreeString(candidateTree);
-                readTreeString(candidateTrees.getRandCandTree());
-//                if (params->fix_stable_splits)
-//                	assert(containsSplits(candidateTrees.getStableSplits()));
-                if (params->iqp) {
-                    doIQP();
-                } else {
-                    doRandomNNIs(numPerturb);
-                }
-            } else {
-            	readTreeString(candidateTrees.getBestTrees()[0]);
-                doIQP();
-            }
-//            perturb_tree_string = getTreeString();
-            if (params->count_trees) {
-                string perturb_tree_topo = getTopology();
-                if (pllTreeCounter.find(perturb_tree_topo) == pllTreeCounter.end()) {
-                    // not found in hash_map
-                    pllTreeCounter[perturb_tree_topo] = 1;
-                } else {
-                    // found in hash_map
-                    pllTreeCounter[perturb_tree_topo]++;
-                }
-            }
+        string curTree;
+        /*----------------------------------------
+         * Perturb the tree
+         *---------------------------------------*/
+        doTreePerturbation();
+
+        /*----------------------------------------
+         * Optimize tree with NNI
+         *----------------------------------------*/
+        pair<int, int> nniInfos; // <num_NNIs, num_steps>
+        nniInfos = doNNISearch();
+        curTree = getTreeString();
+        int pos = addTreeToCandidateSet(curTree, curScore, true, MPIHelper::getInstance().getProcessID());
+        if (pos != -2 && pos != -1 && (Params::getInstance().fixStableSplits || Params::getInstance().adaptPertubation))
+            candidateTrees.computeSplitOccurences(Params::getInstance().stableSplitThreshold);
+
+        if (MPIHelper::getInstance().isWorker() || MPIHelper::getInstance().gotMessage())
+            syncCurrentTree();
 
-            double oldScore = curScore;
-            computeLogL();
-            perturbScore = curScore;
-            if (perturbScore < oldScore - 0.01)
-                treechanged = true;
+/*
+#ifdef _IQTREE_MPI
+        //----------- NON-BLOCKING COMMUNICATION ---------//
+        int maxNumTrees = (MPIHelper::getInstance().getNumProcesses() - 1) * 2;
+        if (MPIHelper::getInstance().isMaster()) {
+            // master: receive tree from WORKERS
+            bool candidateset_changed = MPI_CollectTrees(false, maxNumTrees, true);
+            if (candidateset_changed) {
+                vector<string> bestTrees = candidateTrees.getBestTreeStrings(Params::getInstance().popSize);
+                vector<double> bestScores = candidateTrees.getBestScores(Params::getInstance().popSize);
+                MPIHelper::getInstance().distributeTrees(bestTrees, bestScores, TREE_TAG);
+            }
+        } else {
+            // worker: always send tree to MASTER
+            MPIHelper::getInstance().sendTree(PROC_MASTER, getTreeString(), curScore, TREE_TAG);
+            MPI_CollectTrees(false, maxNumTrees, true);
         }
+#endif
+*/
 
-    	/*----------------------------------------
-    	 * Optimize tree with NNI
-    	 *---------------------------------------*/
-        int nni_count = 0;
-        int nni_steps = 0;
-
-        imd_tree = doNNISearch(nni_count, nni_steps);
-        
-        if (nni_count == 0 && params->snni && numPerturb > 0 && treechanged) {
-            assert(0 && "BUG: NNI could not improved perturbed tree");
-        }
 
+        // TODO: cannot check yet, need to somehow return treechanged
+//        if (nni_count == 0 && params->snni && numPerturb > 0 && treechanged) {
+//            assert(0 && "BUG: NNI could not improved perturbed tree");
+//        }
+//
         if (iqp_assess_quartet == IQP_BOOTSTRAP) {
             // restore alignment
             delete aln;
@@ -1994,107 +2321,83 @@ double IQTree::doTreeSearch() {
         }
 
         if (isSuperTree()) {
-            ((PhyloSuperTree*) this)->computeBranchLengths();
+            ((PhyloSuperTree *) this)->computeBranchLengths();
         }
 
-    	/*----------------------------------------
+        /*----------------------------------------
     	 * Print information
     	 *---------------------------------------*/
-        double realtime_remaining = stop_rule.getRemainingTime(stop_rule.getCurIt(), cur_correlation);
-        cout.setf(ios::fixed, ios::floatfield);
-
-        // only print every 10th iteration or high verbose mode
-        if (stop_rule.getCurIt() % 10 == 0 || verbose_mode >= VB_MED) {
-			cout << ((iqp_assess_quartet == IQP_BOOTSTRAP) ? "Bootstrap " : "Iteration ") << stop_rule.getCurIt() << " / LogL: ";
-			if (verbose_mode >= VB_MED)
-				cout << perturbScore << " -> ";
-			cout << curScore;
-			if (verbose_mode >= VB_MED)
-				cout << " / (NNIs, Steps): (" << nni_count << "," << nni_steps << ")";
-			cout << " / Time: " << convert_time(getRealTime() - params->start_real_time);
-
-			if (stop_rule.getCurIt() > 10) {
-				cout << " (" << convert_time(realtime_remaining) << " left)";
-			}
-			cout << endl;
-        }
+        //printInterationInfo();
 
-        if (params->write_intermediate_trees && save_all_trees != 2) {
-            printIntermediateTree(WT_NEWLINE | WT_APPEND | WT_SORT_TAXA | WT_BR_LEN);
-        }
+//        if (params->write_intermediate_trees && save_all_trees != 2) {
+//            printIntermediateTree(WT_NEWLINE | WT_APPEND | WT_SORT_TAXA | WT_BR_LEN);
+//        }
 
-    	/*----------------------------------------
-    	 * Update if better tree is found
-    	 *---------------------------------------*/
-        if (curScore > candidateTrees.getBestScore() + params->modeps) {
-        	if (params->snni) {
-        		imd_tree = optimizeModelParameters();
-                getModelFactory()->saveCheckpoint();
-        	}
-            if (!candidateTrees.treeExist(imd_tree)) {
-                stop_rule.addImprovedIteration(stop_rule.getCurIt());
-                cout << "BETTER TREE FOUND at iteration " << stop_rule.getCurIt() << ": " << curScore << endl;
-            } else {
-                cout << "UPDATE BEST LOG-LIKELIHOOD: " << curScore << endl;
-            }
-            printResultTree();
+        if (params->snni && verbose_mode >= VB_DEBUG) {
+            printBestScores();
         }
 
-    	candidateTrees.update(imd_tree, curScore);
-    	if (params->snni && verbose_mode >= VB_MED) {
-        	printBestScores(params->popSize);
-    	}
-
         // DTH: make pllUFBootData usable in summarizeBootstrap
-        if(params->pll && params->online_bootstrap && (params->gbo_replicates > 0))
+        if (params->pll && params->online_bootstrap && (params->gbo_replicates > 0))
             pllConvertUFBootData2IQTree();
         // DTH: Carefully watch the -pll case here
 
-
-    	/*----------------------------------------
-    	 * convergence criterion for ultrafast bootstrap
-    	 *---------------------------------------*/
-        if ((stop_rule.getCurIt()) % (params->step_iterations / 2) == 0 && params->stop_condition == SC_BOOTSTRAP_CORRELATION) {
-        	// compute split support every half step
+        /*----------------------------------------
+         * convergence criterion for ultrafast bootstrap
+         *---------------------------------------*/
+         
+        // workers send bootstrap trees, TODO: blocking communication
+//        if (params->stop_condition == SC_BOOTSTRAP_CORRELATION && MPIHelper::getInstance().isWorker())
+//            collectBootTrees();
+
+        // MASTER receives bootstrap trees and performs the convergence (stop) test
+        if ((stop_rule.getCurIt()) >= ufboot_count &&
+            params->stop_condition == SC_BOOTSTRAP_CORRELATION && MPIHelper::getInstance().isMaster()) {
+//            collectBootTrees();
+            ufboot_count += params->step_iterations/2;
+            // compute split support every half step
             SplitGraph *sg = new SplitGraph;
             summarizeBootstrap(*sg);
             sg->removeTrivialSplits();
             sg->setCheckpoint(checkpoint);
             boot_splits.push_back(sg);
-//            if (params->max_candidate_trees == 0)
-//                max_candidate_trees = treels_logl.size() * (stop_rule.getCurIt() + (params->step_iterations / 2)) /
-//                                                           stop_rule.getCurIt();
-//			cout << "NOTE: " << treels_logl.size() << " bootstrap candidate trees evaluated (logl-cutoff: " << logl_cutoff << ")" << endl;
-			cout << "Log-likelihood cutoff on original alignment: " << logl_cutoff << endl;
-
-			// check convergence every full step
-			if (stop_rule.getCurIt() % params->step_iterations == 0) {
-	        	cur_correlation = computeBootstrapCorrelation();
-	            cout << "NOTE: Bootstrap correlation coefficient of split occurrence frequencies: " << cur_correlation << endl;
-	            if (!stop_rule.meetStopCondition(stop_rule.getCurIt(), cur_correlation)) {
-//	                if (params->max_candidate_trees == 0) {
-//	                    max_candidate_trees = treels_logl.size() * (stop_rule.getCurIt() + params->step_iterations) /
-//                                                                   stop_rule.getCurIt();
-//	                }
-//	                cout << "INFO: UFBoot does not converge, continue " << params->step_iterations << " more iterations" << endl;
-	            }
-	        }
+            cout << "Log-likelihood cutoff on original alignment: " << logl_cutoff << endl;
+//            MPIHelper::getInstance().sendMsg(LOGL_CUTOFF_TAG, convertDoubleToString(logl_cutoff));
+
+            // check convergence every full step
+            if (stop_rule.getCurIt() >= ufboot_count_check) {
+                ufboot_count_check += params->step_iterations;
+                cur_correlation = computeBootstrapCorrelation();
+                cout << "NOTE: Bootstrap correlation coefficient of split occurrence frequencies: " <<
+                cur_correlation << endl;
+                if (!stop_rule.meetCorrelation(cur_correlation)) {
+	                cout << "NOTE: UFBoot does not converge, continue at least " << params->step_iterations << " more iterations" << endl;
+                }
+            }
+            if (params->gbo_replicates && params->online_bootstrap && params->print_ufboot_trees)
+                writeUFBootTrees(*params);
+
         } // end of bootstrap convergence test
 
         // print UFBoot trees every 10 iterations
-		if (params->gbo_replicates && params->online_bootstrap && params->print_ufboot_trees &&
-                                                                  stop_rule.getCurIt() % 10 == 0)
-				writeUFBootTrees(*params);
 
         saveCheckpoint();
         checkpoint->dump();
-        
-       //if (params->partition_type)
-       // 	((PhyloSuperTreePlen*)this)->printNNIcasesNUM();
-       
+
+        if (bestcandidate_changed) {
+            printBestCandidateTree();
+            bestcandidate_changed = false;
+        }
+
+        //if (params->partition_type)
+        // 	((PhyloSuperTreePlen*)this)->printNNIcasesNUM();
+
     }
 
-    readTreeString(candidateTrees.getTopTrees()[0]);
+    if (optimization_looped)
+        sendStopMessage();
+
+    readTreeString(candidateTrees.getBestTreeStrings()[0]);
 
     if (testNNI)
         outNNI.close();
@@ -2106,220 +2409,370 @@ double IQTree::doTreeSearch() {
     }
 
     // DTH: pllUFBoot deallocation
-    if(params->pll) {
+    if (params->pll) {
         pllDestroyUFBootData();
     }
 
+#ifdef _IQTREE_MPI
+    cout << "Total number of trees received: " << MPIHelper::getInstance().getNumTreeReceived() << endl;
+    cout << "Total number of trees sent: " << MPIHelper::getInstance().getNumTreeSent() << endl;
+    cout << "Total number of NNI searches done by myself: " << MPIHelper::getInstance().getNumNNISearch() << endl;
+    MPIHelper::getInstance().resetNumbers();
+//    MPI_Finalize();
+//    if (MPIHelper::getInstance().getProcessID() != MASTER) {
+//        exit(0);
+//    }
+#endif
+
+
     return candidateTrees.getBestScore();
+
+}
+
+void IQTree::printIterationInfo(int sourceProcID) {
+    double realtime_remaining = stop_rule.getRemainingTime(stop_rule.getCurIt());
+    cout.setf(ios_base::fixed, ios_base::floatfield);
+
+    // only print every 10th iteration or high verbose mode
+    if (stop_rule.getCurIt() % 10 == 0 || verbose_mode >= VB_MED) {
+            cout << ((iqp_assess_quartet == IQP_BOOTSTRAP) ? "Bootstrap " : "Iteration ") << stop_rule.getCurIt() <<
+            " / LogL: ";
+            cout << curScore;
+            cout << " / Time: " << convert_time(getRealTime() - params->start_real_time);
+
+            if (stop_rule.getCurIt() > 20) {
+                cout << " (" << convert_time(realtime_remaining) << " left)";
+            }
+            if (MPIHelper::getInstance().getNumProcesses() > 1)
+                cout << " / Process: " << sourceProcID;
+            cout << endl;
+        }
+}
+
+//void IQTree::estimateLoglCutoffBS() {
+//    if (params->avoid_duplicated_trees && max_candidate_trees > 0 && treels_logl.size() > 1000) {
+//        int predicted_iteration;
+//        predicted_iteration = ((stop_rule.getCurIt() + params->step_iterations - 1) / params->step_iterations)
+//                              * params->step_iterations;
+//        int num_entries = (int) floor(max_candidate_trees * ((double) stop_rule.getCurIt() / predicted_iteration));
+//        if (num_entries < treels_logl.size() * 0.9) {
+//            DoubleVector logl = treels_logl;
+//            nth_element(logl.begin(), logl.begin() + (treels_logl.size() - num_entries), logl.end());
+//            logl_cutoff = logl[treels_logl.size() - num_entries] - 1.0;
+//        } else
+//            logl_cutoff = 0.0;
+//        if (verbose_mode >= VB_MED) {
+//            if (stop_rule.getCurIt() % 10 == 0) {
+//                cout << treels.size() << " trees, " << treels_logl.size() << " logls, logl_cutoff= " << logl_cutoff;
+//                if (params->store_candidate_trees)
+//                    cout << " duplicates= " << duplication_counter << " ("
+//                    << (int) round(100 * ((double) duplication_counter / treels_logl.size())) << "%)" << endl;
+//                else
+//                    cout << endl;
+//            }
+//        }
+//    }
+//}
+
+#ifdef _IQTREE_MPI
+bool IQTree::MPI_CollectTrees(bool allTrees, int maxNumTrees, bool updateStopRule) {
+    if (MPIHelper::getInstance().getNumProcesses() == 1)
+        return false;
+    TreeCollection outTrees;
+    double start = getRealTime();
+    MPIHelper::getInstance().receiveTrees(allTrees, maxNumTrees, outTrees, TREE_TAG);
+    double commTime = getRealTime() - start;
+    if (verbose_mode >= VB_MED && outTrees.getNumTrees()> 0) {
+        cout << outTrees.getNumTrees() << " trees received from other processes in ";
+        cout << commTime << " seconds" << endl;
+    }
+    if (commTime > 1.0) {
+        cout << "WARNING: Communication time (" << commTime << " sec) is too slow. Please increase MP_BUFFER_MEM and MP_EAGER_LIMIT" << endl;
+    }
+
+//    PhyloTree phyloTree;
+//    phyloTree.aln = this->aln;
+//    phyloTree.setParams(&(Params::getInstance()));
+
+    bool candidateset_changed = false;
+
+    for (int i = 0; i < outTrees.getNumTrees(); i++) {
+        pair<string, double> tree = outTrees.getTree(i);
+        if (tree.first == "notree") {
+            if (updateStopRule) {
+                stop_rule.setCurIt(stop_rule.getCurIt() + 1);
+                curScore = tree.second;
+                cout << "Bad tree with score: " << tree.second << " skipped" << endl;
+                printIterationInfo(outTrees.getSourceProcID()[i]);
+            }
+        } else {
+//            phyloTree.readTreeString(tree.first, true);
+//            string treeString = phyloTree.getTreeString();
+            int pos = addTreeToCandidateSet(tree.first, tree.second, updateStopRule, outTrees.getSourceProcID()[i]);
+            if (pos >= 0 && pos < params->popSize)
+            	candidateset_changed = true;
+        }
+    }
+    return candidateset_changed;
+}
+#endif
+
+double IQTree::doTreePerturbation() {
+    if (iqp_assess_quartet == IQP_BOOTSTRAP) {
+        // create bootstrap sample
+        Alignment *bootstrap_alignment;
+        if (aln->isSuperAlignment())
+            bootstrap_alignment = new SuperAlignment;
+        else
+            bootstrap_alignment = new Alignment;
+        bootstrap_alignment->createBootstrapAlignment(aln, NULL, params->bootstrap_spec);
+        setAlignment(bootstrap_alignment);
+        initializeAllPartialLh();
+        clearAllPartialLH();
+        curScore = optimizeAllBranches();
+    } else {
+        if (params->snni) {
+            if (Params::getInstance().five_plus_five) {
+                readTreeString(candidateTrees.getNextCandTree());
+            } else {
+                readTreeString(candidateTrees.getRandTopTree(Params::getInstance().popSize));
+            }
+            if (Params::getInstance().iqp) {
+                doIQP();
+            } else if (Params::getInstance().adaptPertubation) {
+                perturbStableSplits(Params::getInstance().stableSplitThreshold);
+            } else {
+                doRandomNNIs(Params::getInstance().tabu);
+            }
+        } else {
+            // Using the IQPNNI algorithm (best tree is selected)
+            readTreeString(getBestTrees()[0]);
+            doIQP();
+        }
+        if (params->count_trees) {
+            string perturb_tree_topo = getTopologyString(false);
+            if (pllTreeCounter.find(perturb_tree_topo) == pllTreeCounter.end()) {
+                // not found in hash_map
+                pllTreeCounter[perturb_tree_topo] = 1;
+            } else {
+                // found in hash_map
+                pllTreeCounter[perturb_tree_topo]++;
+            }
+        }
+        //optimizeBranches(1);
+        curScore = computeLogL();
+    }
+    return curScore;
 }
 
 /****************************************************************************
  Fast Nearest Neighbor Interchange by maximum likelihood
  ****************************************************************************/
-string IQTree::doNNISearch(int& nniCount, int& nniSteps) {
-	string treeString;
+pair<int, int> IQTree::doNNISearch() {
+
+    computeLogL();
+    double curBestScore = getBestScore();
+
+    if (Params::getInstance().write_intermediate_trees && save_all_trees != 2) {
+        printIntermediateTree(WT_NEWLINE | WT_APPEND | WT_SORT_TAXA | WT_BR_LEN);
+    }
+
+    pair<int, int> nniInfos; // Number of NNIs and number of steps
     if (params->pll) {
     	if (params->partition_file)
     		outError("Unsupported -pll -sp combination!");
-        curScore = pllOptimizeNNI(nniCount, nniSteps, searchinfo);
+        curScore = pllOptimizeNNI(nniInfos.first, nniInfos.second, searchinfo);
         pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions, pllInst->start->back, PLL_TRUE,
                 PLL_TRUE, 0, 0, 0, PLL_SUMMARIZE_LH, 0, 0);
-        treeString = string(pllInst->tree_string);
-//        readTreeString(treeString);
+        readTreeString(string(pllInst->tree_string));
     } else {
-        curScore = optimizeNNI(nniCount, nniSteps);
+        nniInfos = optimizeNNI(Params::getInstance().speednni);
         if (isSuperTree()) {
             ((PhyloSuperTree*) this)->computeBranchLengths();
         }
-        treeString = getTreeString();
         if (params->print_trees_site_posterior)
             computePatternCategories();
     }
-    return treeString;
+    // Better tree or score is found
+    if (getCurScore() > curBestScore + params->modelEps) {
+        // Re-optimize model parameters (the sNNI algorithm)
+        optimizeModelParameters(false, params->modelEps * 10);
+        getModelFactory()->saveCheckpoint();
+    }
+    MPIHelper::getInstance().setNumNNISearch(MPIHelper::getInstance().getNumNNISearch() + 1);
+
+    return nniInfos;
 }
 
-double IQTree::optimizeNNI(int &nni_count, int &nni_steps) {
-    bool rollBack = false;
-    nni_count = 0;
-    int numNNIs = 0; // number of NNI to be applied in each step
-    const int MAXSTEPS = aln->getNSeq(); // maximum number of NNI steps
-    NodeVector nodes1, nodes2;
-    DoubleVector lenvec;
-    for (nni_steps = 1; nni_steps <= MAXSTEPS; nni_steps++) {
+pair<int, int> IQTree::optimizeNNI(bool speedNNI) {
+    unsigned int totalNNIApplied = 0;
+    unsigned int numSteps = 0;
+    const int MAXSTEPS = leafNum;
+    unsigned int numInnerBranches = leafNum - 3;
+    double curBestScore = candidateTrees.getBestScore();
+
+    Branches nniBranches;
+    Branches nonNNIBranches;
+    vector<NNIMove> positiveNNIs;
+    vector<NNIMove> appliedNNIs;
+    SplitIntMap tabuSplits;
+    if (!initTabuSplits.empty()) {
+        tabuSplits = initTabuSplits;
+    }
+
+    for (numSteps = 1; numSteps <= MAXSTEPS; numSteps++) {
+
+//        cout << "numSteps = " << numSteps << endl;
         double oldScore = curScore;
-        if (!rollBack) { // tree get improved and was not rollbacked
-            if (save_all_trees == 2) {
-                saveCurrentTree(curScore); // BQM: for new bootstrap
-            }
-            if (verbose_mode >= VB_DEBUG) {
-                cout << "Doing NNI round " << nni_steps << endl;
-                if (isSuperTree()) {
-                    ((PhyloSuperTree*) this)->printMapInfo();
-                }
+        if (save_all_trees == 2) {
+            saveCurrentTree(curScore); // BQM: for new bootstrap
+        }
+        if (verbose_mode >= VB_DEBUG) {
+            cout << "Doing NNI round " << numSteps << endl;
+            if (isSuperTree()) {
+                ((PhyloSuperTree*) this)->printMapInfo();
             }
+        }
 
-            nonConfNNIs.clear(); // Vector containing non-conflicting positive NNIs
-//            optBrans.clear(); // Vector containing branch length of the positive NNIs
-//            orgBrans.clear(); // Vector containing all current branch of the tree
-            plusNNIs.clear(); // Vector containing all positive NNIs
-//            saveBranches(); // save all current branch lengths
-            saveBranchLengths(lenvec);
-            initPartitionInfo(); // for super tree
-            int numRemoved;
-            if (nodes1.size() == 0) {
-            	assert (nodes2.size() == 0);
-            	getAllInnerBranches(nodes1, nodes2, &candidateTrees.getStableSplits());
-            	assert(nodes1.size() == (aln->getNSeq() - 3 - candidateTrees.getStableSplits().size()));
-            } else {
-            	// exclude stable splits from NNI evaluation
-                numRemoved = removeBranches(nodes1, nodes2, candidateTrees.getStableSplits());
-            }
-//            cout << "Number of splits removed: " << numRemoved << endl;
-            assert(nodes1.size() == nodes2.size());
-//            for (int i = 0; i < nodes1.size(); i++) {
-//            	cout << "(" << nodes1[i]->id << "," << nodes2[i]->id << ") ; ";
-//            }
-//            cout << endl;
-//            printTree(cout, WT_TAXON_ID + WT_INT_NODE + WT_NEWLINE);
-            evalNNIs(nodes1, nodes2);
-
-//            if (!nni_sort) {
-//                evalNNIs(); // generate all positive NNI moves
-//            } else {
-//                evalNNIsSort(params->approximate_nni);
-//            }
+        // save all current branch lengths
+        DoubleVector lenvec;
+    	saveBranchLengths(lenvec);
 
-            /* sort all positive NNI moves (descending) */
-            sort(plusNNIs.begin(), plusNNIs.end());
-            if (verbose_mode >= VB_DEBUG) {
-                cout << "curScore: " << curScore << endl;
-                for (int i = 0; i < plusNNIs.size(); i++) {
-                    cout << "Logl of positive NNI " << i << " : " << plusNNIs[i].newloglh << endl;
-                }
-            }
+        // for super tree
+        initPartitionInfo();
 
-            if (plusNNIs.size() == 0) {
-                break;
-            }
+        nniBranches.clear();
+        nonNNIBranches.clear();
+
+        bool startSpeedNNI;
+        // When tabu and speednni are combined, speednni only starts from the third step
+        if (!initTabuSplits.empty() && numSteps < 3) {
+            startSpeedNNI = false;
+        } else if (speedNNI && !appliedNNIs.empty()) {
+            startSpeedNNI = true;
+        } else {
+            startSpeedNNI = false;
+        }
+
+        if (startSpeedNNI) {
+            // speedNNI option: only evaluate NNIs that are 2 branches away from the previously applied NNI
+            Branches filteredNNIBranches;
+            filterNNIBranches(appliedNNIs, filteredNNIBranches);
+            for (Branches::iterator it = filteredNNIBranches.begin(); it != filteredNNIBranches.end(); it++) {
+                Branch curBranch = it->second;
+                PhyloNeighbor* nei = (PhyloNeighbor*) curBranch.first->findNeighbor(curBranch.second);
+                Split* curSplit = nei->split;
+                bool tabu = false;
+                bool stable = false;
+                if (!tabuSplits.empty()) {
+                    int value;
+                    if (tabuSplits.findSplit(curSplit, value) != NULL)
+                        tabu = true;
+                }
+                if (!candidateTrees.getCandSplits().empty()) {
+                    int value;
+                    if (candidateTrees.getCandSplits().findSplit(curSplit, value) != NULL)
+                        stable = true;
 
-            /* remove conflicting NNIs */
-            genNonconfNNIs();
-            numNNIs = nonConfNNIs.size();
-            if (verbose_mode >= VB_DEBUG) {
-                for (int i = 0; i < nonConfNNIs.size(); i++) {
-                    cout << "Log-likelihood of non-conflicting NNI " << i << " : " << nonConfNNIs[i].newloglh << endl;
+                }
+                if (!tabu && !stable) {
+                    int branchID =  pairInteger(curBranch.first->id, curBranch.second->id);
+                    nniBranches.insert(pair<int, Branch>(branchID, curBranch));
                 }
             }
+        } else {
+            getNNIBranches(tabuSplits, candidateTrees.getCandSplits(), nonNNIBranches, nniBranches);
         }
-        // Apply all non-conflicting positive NNIs
-        doNNIs(numNNIs);
 
-        if (verbose_mode >= VB_DEBUG) {
-        	cout << "NNI step: " << nni_steps << " / Number of NNIs applied: " << numNNIs << endl;
+        if (!tabuSplits.empty()) {
+            tabuSplits.clear();
         }
-    	nodes1.clear();
-    	nodes2.clear();
 
-        if (searchinfo.speednni) {
-        	getBranchesForNNI(nodes1, nodes2, appliedNNIs);
-            appliedNNIs.clear();
+        positiveNNIs.clear();
+        evaluateNNIs(nniBranches, positiveNNIs);
+
+        if (positiveNNIs.size() == 0) {
+            if (!nonNNIBranches.empty() && totalNNIApplied == 0) {
+                evaluateNNIs(nonNNIBranches, positiveNNIs);
+                if (positiveNNIs.size() == 0) {
+                    break;
+                }
+            } else {
+                break;
+            }
         }
 
-        // FOR TUNG: If you want to introduce this heuristic, please confirm with reevaluation again.
-//        if (numNNIs > 1) {
-            // Re-estimate branch lengths of the new tree
-            curScore = optimizeAllBranches(1, params->loglh_epsilon, PLL_NEWZPERCYCLE);
-//        } else {
-//        	curScore = computeLikelihood();
-//        }
+        /* sort all positive NNI moves (ASCENDING) */
+        sort(positiveNNIs.begin(), positiveNNIs.end());
 
+        /* remove conflicting NNIs */
+        appliedNNIs.clear();
+        getCompatibleNNIs(positiveNNIs, appliedNNIs);
 
-		// curScore should be larger than score of the best NNI
-        if (curScore >= nonConfNNIs.at(0).newloglh - params->loglh_epsilon) {
-            nni_count += numNNIs;
-            rollBack = false;
-        	if (params->reduction) {
-        		string newickToplogy = getTopology();
-        		string newickString = getTreeString();
-            	if (candidateTrees.treeTopologyExist(newickToplogy)) {
-            		double oldScore = candidateTrees.getTopologyScore(newickToplogy);
-            		if (curScore > oldScore)
-    					candidateTrees.update(newickString, curScore, false);
-            		break;
-            	} else {
-					candidateTrees.update(newickString, curScore, false);
-            	}
-        	}
-        } else {
-            /* tree cannot be worse if only 1 NNI is applied */
-            if (numNNIs == 1 && curScore < nonConfNNIs.at(0).newloglh - 1.0) {
-            	cout.precision(15);
-                cout << "BUG: current logl=" << curScore << " < " << nonConfNNIs.at(0).newloglh
-                        << "(best NNI)" << endl;
-                assert(0);
-            }
-            if (verbose_mode >= VB_MED) {
-                cout << "New score = " << curScore << " after applying " << numNNIs <<
-                        " is worse than score = " << nonConfNNIs.at(0).newloglh
-                        << " of the best NNI. Roll back tree ..." << endl;
-            }
+        // do non-conflicting positive NNIs
+        doNNIs(appliedNNIs);
+        curScore = optimizeAllBranches(1, params->loglh_epsilon, PLL_NEWZPERCYCLE);
 
-            // restore the tree by reverting all NNIs
-            for (int i = 0; i < numNNIs; i++)
-                doNNI(nonConfNNIs.at(i));
-            // restore the branch lengths
-//            restoreAllBrans();
+        if (curScore < appliedNNIs.at(0).newloglh - params->loglh_epsilon) {
+            //cout << "Tree getting worse: curScore = " << curScore << " / best score = " <<  appliedNNIs.at(0).newloglh << endl;
+            // tree cannot be worse if only 1 NNI is applied
+            assert(appliedNNIs.size() != 1);
+            doNNIs(appliedNNIs);
             restoreBranchLengths(lenvec);
-            // This is important because after restoring the branch lengths, all partial
-            // likelihood need to be cleared.
-//            if (params->lh_mem_save == LM_PER_NODE) {
-//                initializeAllPartialLh();
-//            } else
             clearAllPartialLH();
-            
-            // UPDATE: the following is not needed as clearAllPartialLH() is now also defined for SuperTree
-            // BQM: This was missing: one should also clear all subtrees of a supertree
-//            if (isSuperTree()) {
-//            	PhyloSuperTree *stree = (PhyloSuperTree*)this;
-//            	for (PhyloSuperTree::iterator it = stree->begin(); it != stree->end(); it++) {
-//            		(*it)->clearAllPartialLH();
-//            	}
-//            }
-            rollBack = true;
-            // only apply the best NNI
-            numNNIs = 1;
-            curScore = oldScore;
+            // only do the best NNI
+            appliedNNIs.resize(1);
+            doNNIs(appliedNNIs);
+//            doNNI(appliedNNIs[0]);
+            totalNNIApplied++;
+            curScore = optimizeAllBranches(1, params->loglh_epsilon, PLL_NEWZPERCYCLE);
+            assert(curScore > appliedNNIs.at(0).newloglh - params->loglh_epsilon);
+        } else {
+            totalNNIApplied += appliedNNIs.size();
         }
-        // BUG in following line, causing premature break by rollBack! that's why commented out 
-//        if (curScore - oldScore < 0.1)
-//        	break;
-    }
 
-    if (nni_count == 0 && verbose_mode >= VB_MED) {
-        cout << "NOTE: Tree is already NNI-optimized" << endl;
+
+        if (curScore - oldScore <  params->loglh_epsilon)
+            break;
+
+        if (params->snni && (curScore > curBestScore + 0.1)) {
+            curBestScore = curScore;
+        }
+
+        if (Params::getInstance().write_intermediate_trees && save_all_trees != 2) {
+            printIntermediateTree(WT_NEWLINE | WT_APPEND | WT_SORT_TAXA | WT_BR_LEN);
+        }
+
+        if (Params::getInstance().writeDistImdTrees) {
+            intermediateTrees.update(getTreeString(), curScore);
+        }
     }
-    if (nni_steps == MAXSTEPS) {
-    	cout << "WARNING: NNI search needs unusual large number of steps (" << MAXSTEPS << ") to converge!" << endl;
+
+    if (totalNNIApplied == 0 && verbose_mode >= VB_MED) {
+        cout << "NOTE: Input tree is already NNI-optimal" << endl;
     }
-    return curScore;
-}
 
-void IQTree::getBranchesForNNI(NodeVector& nodes1, NodeVector& nodes2, vector<NNIMove>& nnis) {
-	assert(nodes1.size() == nodes2.size());
-    for (vector<NNIMove>::iterator it = nnis.begin(); it != nnis.end(); it++) {
-    	if (!branchExist((*it).node1, (*it).node2, nodes1, nodes2)) {
-    		assert(isInnerBranch((*it).node1, (*it).node2));
-        	nodes1.push_back((*it).node1);
-        	nodes2.push_back((*it).node2);
+    if (numSteps == MAXSTEPS) {
+        cout << "WARNING: NNI search needs unusual large number of steps (" << numInnerBranches << ") to converge!" << endl;
     }
-    	getInnerBranches(nodes1, nodes2, 2, (*it).node1, (*it).node2);
-    	getInnerBranches(nodes1, nodes2, 2, (*it).node2, (*it).node1);
+    return make_pair(numSteps, totalNNIApplied);
 }
 
+void IQTree::filterNNIBranches(vector<NNIMove> &appliedNNIs, Branches &nniBranches) {
+    for (vector<NNIMove>::iterator it = appliedNNIs.begin(); it != appliedNNIs.end(); it++) {
+        Branch curBranch;
+        curBranch.first = it->node1;
+        curBranch.second = it->node2;
+        int branchID = pairInteger(it->node1->id, it->node2->id);
+        if (nniBranches.find(branchID) == nniBranches.end())
+            nniBranches.insert(pair<int,Branch>(branchID, curBranch));
+        getSurroundingInnerBranches(it->node1, it->node2, 2, nniBranches);
+        getSurroundingInnerBranches(it->node2, it->node1, 2, nniBranches);
+    }
 }
 
 double IQTree::pllOptimizeNNI(int &totalNNICount, int &nniSteps, SearchInfo &searchinfo) {
-    if((globalParam->online_bootstrap == PLL_TRUE) && (globalParam->gbo_replicates > 0)) {
+    if((globalParams->online_bootstrap == PLL_TRUE) && (globalParams->gbo_replicates > 0)) {
         pllInitUFBootData();
     }
     searchinfo.numAppliedNNIs = 0;
@@ -2467,34 +2920,35 @@ void IQTree::pllDestroyUFBootData(){
 }
 
 
-void IQTree::doNNIs(int nni2apply, bool changeBran) {
-    for (int i = 0; i < nni2apply; i++) {
-        doNNI(nonConfNNIs.at(i));
-        appliedNNIs.push_back(nonConfNNIs.at(i));
+void IQTree::doNNIs(vector<NNIMove> &compatibleNNIs, bool changeBran) {
+    for (vector<NNIMove>::iterator it = compatibleNNIs.begin(); it != compatibleNNIs.end(); it++) {
+		doNNI(*it);
         if (!params->leastSquareNNI && changeBran) {
             // apply new branch lengths
-            changeNNIBrans(nonConfNNIs.at(i));
+			changeNNIBrans(*it);
         }
     }
     // 2015-10-14: has to reset this pointer when read in
     current_it = current_it_back = NULL;
-    
+
 }
 
 
-void IQTree::genNonconfNNIs() {
-    for (vector<NNIMove>::iterator iterMove = plusNNIs.begin(); iterMove != plusNNIs.end(); iterMove++) {
-        bool choosen = true;
-        for (vector<NNIMove>::iterator iterNextMove = nonConfNNIs.begin(); iterNextMove != nonConfNNIs.end();
-                iterNextMove++) {
-            if ((*iterMove).node1 == (*(iterNextMove)).node1 || (*iterMove).node2 == (*(iterNextMove)).node1
-                    || (*iterMove).node1 == (*(iterNextMove)).node2 || (*iterMove).node2 == (*(iterNextMove)).node2) {
-                choosen = false;
+void IQTree::getCompatibleNNIs(vector<NNIMove> &nniMoves, vector<NNIMove> &compatibleNNIs) {
+    compatibleNNIs.clear();
+	for (vector<NNIMove>::iterator it1 = nniMoves.begin(); it1 != nniMoves.end(); it1++) {
+		bool select = true;
+		for (vector<NNIMove>::iterator it2 = compatibleNNIs.begin(); it2 != compatibleNNIs.end(); it2++) {
+			if ((*it1).node1 == (*(it2)).node1
+					|| (*it1).node2 == (*(it2)).node1
+					|| (*it1).node1 == (*(it2)).node2
+					|| (*it1).node2 == (*(it2)).node2) {
+				select = false;
                 break;
             }
         }
-        if (choosen) {
-            nonConfNNIs.push_back(*iterMove);
+		if (select) {
+            compatibleNNIs.push_back(*it1);
         }
     }
 }
@@ -2559,39 +3013,35 @@ void IQTree::setDelete(int _delete) {
     k_delete = _delete;
 }
 
-void IQTree::evalNNIs(PhyloNode *node, PhyloNode *dad) {
-    if (!node) {
-        node = (PhyloNode*) root;
-    }
-    // internal branch
-    if (!node->isLeaf() && dad && !dad->isLeaf()) {
-        NNIMove myMove = getBestNNIForBran(node, dad, NULL);
-        if (myMove.newloglh > curScore + params->loglh_epsilon) {
-            addPositiveNNIMove(myMove);
+void IQTree::evaluateNNIs(Branches &nniBranches, vector<NNIMove>  &positiveNNIs) {
+    for (Branches::iterator it = nniBranches.begin(); it != nniBranches.end(); it++) {
+        NNIMove nni = getBestNNIForBran((PhyloNode*) it->second.first, (PhyloNode*) it->second.second, NULL);
+        if (nni.newloglh > curScore) {
+            positiveNNIs.push_back(nni);
         }
-    }
 
-    FOR_NEIGHBOR_IT(node, dad, it){
-        evalNNIs((PhyloNode*) (*it)->node, node);
+        // synchronize tree during optimization step
+        if (MPIHelper::getInstance().isMaster() && candidateset_changed.size() > 0
+            && MPIHelper::getInstance().gotMessage()) {
+            syncCurrentTree();
+        }
     }
 }
 
-void IQTree::evalNNIs(NodeVector& nodes1, NodeVector& nodes2) {
-	if (!nodes1.empty()) {
-		assert(!nodes2.empty());
-		assert(nodes1.size() == nodes2.size());
-		NodeVector::iterator it1;
-		NodeVector::iterator it2;
-		for (it1 = nodes1.begin(), it2 = nodes2.begin(); it1 != nodes1.end() && it2 != nodes2.end(); it1++, it2++) {
-			assert(isInnerBranch(*it1, *it2));
-			NNIMove myMove = getBestNNIForBran((PhyloNode*) *it1, (PhyloNode*) *it2, NULL);
-        if (myMove.newloglh > curScore + params->loglh_epsilon) {
-            addPositiveNNIMove(myMove);
-        }
-		}
-	} else {
-		evalNNIs();
+//Branches IQTree::getReducedListOfNNIBranches(Branches &previousNNIBranches) {
+//    Branches resBranches;
+//    for (Branches::iterator it = previousNNIBranches.begin(); it != previousNNIBranches.end(); it++) {
+//        getSurroundingInnerBranches(it->second.first, it->second.second, 2, resBranches);
+//        getSurroundingInnerBranches(it->second.second, it->second.first, 2, resBranches);
+//    }
+//}
+
+double IQTree::optimizeNNIBranches(Branches &nniBranches) {
+    for (Branches::iterator it = nniBranches.begin(); it != nniBranches.end(); it++) {
+        optimizeOneBranch((PhyloNode*) it->second.first, (PhyloNode*) it->second.second, true, PLL_NEWZPERCYCLE);
     }
+    curScore = computeLikelihoodFromBuffer();
+    return curScore;
 }
 
 /**
@@ -2697,7 +3147,7 @@ void IQTree::saveCurrentTree(double cur_logl) {
 //    treels_logl.push_back(cur_logl);
 //    num_trees_for_rell++;
 
-    if (write_intermediate_trees)
+    if (Params::getInstance().write_intermediate_trees)
         printTree(out_treels, WT_NEWLINE | WT_BR_LEN);
 
     int nptn = getAlnNPattern();
@@ -2724,7 +3174,7 @@ void IQTree::saveCurrentTree(double cur_logl) {
         // online bootstrap
 //        int ptn;
 //        int updated = 0;
-        int nsamples = boot_samples.size();
+//        int nsamples = boot_samples.size();
         ostringstream ostr;
         string tree_str, tree_str_brlen;
         setRootNode(params->root);
@@ -2740,7 +3190,7 @@ void IQTree::saveCurrentTree(double cur_logl) {
         #ifdef _OPENMP
         #pragma omp parallel for
         #endif
-        for (int sample = 0; sample < nsamples; sample++) {
+        for (int sample = sample_start; sample < sample_end; sample++) {
             double rell = 0.0;
 
             {
@@ -2771,7 +3221,7 @@ void IQTree::saveCurrentTree(double cur_logl) {
             }
         }
     }
-    if (print_tree_lh) {
+    if (Params::getInstance().print_tree_lh) {
         out_treelh << cur_logl;
         double prob;
 #ifdef BOOT_VAL_FLOAT
@@ -2887,7 +3337,7 @@ void IQTree::summarizeBootstrap(Params &params, MTreeSet &trees) {
     freeNode();
     // RARE BUG FIX: to avoid cases that identical seqs were removed and leaf name happens to be IDs
     MTree::readTree(tree_stream, rooted);
-    
+
     assignLeafNames();
     if (isSuperTree()) {
         ((PhyloSuperTree*) this)->mapTrees();
@@ -3073,9 +3523,9 @@ double IQTree::computeBootstrapCorrelation() {
     return corr;
 }
 
-void IQTree::addPositiveNNIMove(NNIMove myMove) {
-    plusNNIs.push_back(myMove);
-}
+//void IQTree::addPositiveNNIMove(NNIMove myMove) {
+//    plusNNIs.push_back(myMove);
+//}
 
 void IQTree::printResultTree(string suffix) {
     setRootNode(params->root);
@@ -3083,13 +3533,18 @@ void IQTree::printResultTree(string suffix) {
         return;
     string tree_file_name = params->out_prefix;
     tree_file_name += ".treefile";
+    if (MPIHelper::getInstance().isWorker()) {
+        return;
+        stringstream processTreeFile;
+        processTreeFile << tree_file_name << "." << MPIHelper::getInstance().getProcessID();
+        tree_file_name = processTreeFile.str();
+    }
     if (suffix.compare("") != 0) {
-        string iter_tree_name = tree_file_name + "." + suffix;
-        printTree(iter_tree_name.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
-    } else {
-        printTree(tree_file_name.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
+        tree_file_name += "." + suffix;
     }
-    //printTree(tree_file_name.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH);
+    printTree(tree_file_name.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
+    if (verbose_mode >= VB_MED)
+        cout << "Best tree printed to " << tree_file_name << endl;
 }
 
 void IQTree::printResultTree(ostream &out) {
@@ -3097,6 +3552,18 @@ void IQTree::printResultTree(ostream &out) {
     printTree(out, WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
 }
 
+void IQTree::printBestCandidateTree() {
+    if (MPIHelper::getInstance().isWorker())
+        return;
+    string tree_file_name = params->out_prefix;
+    tree_file_name += ".treefile";
+    readTreeString(candidateTrees.getBestTreeStrings(1)[0]);
+    setRootNode(params->root);
+    printTree(tree_file_name.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
+    if (verbose_mode >= VB_MED)
+        cout << "Best tree printed to " << tree_file_name << endl;
+}
+
 
 void IQTree::printPhylolibTree(const char* suffix) {
     pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions, pllInst->start->back, PLL_TRUE, 1, 0, 0, 0,
@@ -3119,8 +3586,9 @@ void IQTree::printIntermediateTree(int brtype) {
         computePatternLikelihood(pattern_lh, &logl);
     }
 
-    if (write_intermediate_trees)
+    if (Params::getInstance().write_intermediate_trees)
         printTree(out_treels, brtype);
+
     if (params->print_tree_lh) {
         out_treelh.precision(10);
         out_treelh << logl;
@@ -3140,8 +3608,286 @@ void IQTree::printIntermediateTree(int brtype) {
     }
     int x = save_all_trees;
     save_all_trees = 2;
-    evalNNIs();
+	// TODO Why is evalNNIs() called in this function?
+	//evalNNIs();
+	Branches innerBranches;
+    vector<NNIMove> positiveNNIs;
+	getInnerBranches(innerBranches);
+    evaluateNNIs(innerBranches, positiveNNIs);
     save_all_trees = x;
 }
 
 
+void IQTree::convertNNI2Splits(SplitIntMap &nniSplits, int numNNIs, vector<NNIMove> &compatibleNNIs) {
+    for (int i = 0; i < numNNIs; i++) {
+        Split *sp = new Split(*getSplit(compatibleNNIs[i].node1, compatibleNNIs[i].node2));
+        if (sp->shouldInvert()) {
+            sp->invert();
+        }
+        nniSplits.insertSplit(sp, 1);
+    }
+}
+
+double IQTree::getBestScore() {
+    return candidateTrees.getBestScore();
+}
+
+vector<string> IQTree::getBestTrees(int numTrees) {
+    return candidateTrees.getBestTreeStrings(numTrees);
+}
+
+
+/*******************************************
+    MPI stuff
+*******************************************/
+
+void IQTree::syncCandidateTrees(int nTrees, bool updateStopRule) {
+    if (MPIHelper::getInstance().getNumProcesses() == 1)
+        return;
+
+#ifdef _IQTREE_MPI
+    // gather trees to Master
+
+    Checkpoint *ckp = new Checkpoint;
+
+    if (MPIHelper::getInstance().isMaster()) {
+        // update candidate set at master
+        int trees = 0;
+        for (int w = 1; w < MPIHelper::getInstance().getNumProcesses(); w++) {
+            int worker = MPIHelper::getInstance().recvCheckpoint(ckp);
+            CandidateSet cset;
+            cset.setCheckpoint(ckp);
+            cset.restoreCheckpoint();
+            for (CandidateSet::iterator it = cset.begin(); it != cset.end(); it++)
+                addTreeToCandidateSet(it->second.tree, it->second.score, updateStopRule, worker);
+            trees += ckp->size();
+            ckp->clear();
+        }
+        cout << trees << " candidate trees gathered from workers" << endl;
+        // get the best candidate trees
+        int numTrees = max(nTrees, MPIHelper::getInstance().getNumProcesses());
+        CandidateSet bestCandidates = candidateTrees.getBestCandidateTrees(numTrees);
+        int saved_numNNITrees = params->numNNITrees;
+        params->numNNITrees = numTrees;
+        bestCandidates.setCheckpoint(ckp);
+        bestCandidates.saveCheckpoint();
+        params->numNNITrees = saved_numNNITrees;
+    } else {
+        // send candidate set to master
+        CandidateSet cset = candidateTrees.getBestCandidateTrees();
+        cset.setCheckpoint(ckp);
+        cset.saveCheckpoint();
+        MPIHelper::getInstance().sendCheckpoint(ckp, PROC_MASTER);
+        cout << ckp->size() << " candidate trees sent to master" << endl;
+        ckp->clear();
+    }
+
+    // broadcast candidate trees from master to worker
+    MPIHelper::getInstance().broadcastCheckpoint(ckp);
+    cout << ckp->size() << " trees broadcasted to workers" << endl;
+
+    if (MPIHelper::getInstance().isWorker()) {
+        // update candidate set at worker
+        CandidateSet cset;
+        cset.setCheckpoint(ckp);
+        cset.restoreCheckpoint();
+        for (CandidateSet::iterator it = cset.begin(); it != cset.end(); it++)
+            addTreeToCandidateSet(it->second.tree, it->second.score, false, PROC_MASTER);
+    }
+
+    delete ckp;
+#endif
+}
+
+void IQTree::syncCurrentTree() {
+    if (MPIHelper::getInstance().getNumProcesses() == 1)
+        return;
+#ifdef _IQTREE_MPI
+    //------ BLOCKING COMMUNICATION ------//
+    Checkpoint *checkpoint = new Checkpoint;
+    string tree;
+    double score;
+
+    if (MPIHelper::getInstance().isMaster()) {
+        // master: receive tree from WORKERS
+        int worker = MPIHelper::getInstance().recvCheckpoint(checkpoint);
+        MPIHelper::getInstance().increaseTreeReceived();
+        CKP_RESTORE(tree);
+        CKP_RESTORE(score);
+        int pos = addTreeToCandidateSet(tree, score, true, worker);
+        if (pos >= 0 && pos < params->popSize) {
+            // candidate set is changed, update for other workers
+            for (int w = 0; w < candidateset_changed.size(); w++)
+                if (w != worker)
+                    candidateset_changed[w] = true;
+        }
+
+        if (boot_samples.size() > 0) {
+            restoreUFBoot(checkpoint);
+        }
+
+        // send candidate trees to worker
+        checkpoint->clear();
+        if (boot_samples.size() > 0)
+            CKP_SAVE(logl_cutoff);
+        if (candidateset_changed[worker]) {
+            CandidateSet cset = candidateTrees.getBestCandidateTrees(Params::getInstance().popSize);
+            cset.setCheckpoint(checkpoint);
+            cset.saveCheckpoint();
+            candidateset_changed[worker] = false;
+            MPIHelper::getInstance().increaseTreeSent(Params::getInstance().popSize);
+        }
+        MPIHelper::getInstance().sendCheckpoint(checkpoint, worker);
+    } else {
+        // worker: always send tree to MASTER
+        tree = getTreeString();
+        score = curScore;
+        CKP_SAVE(tree);
+        CKP_SAVE(score);
+        if (boot_samples.size() > 0) {
+            saveUFBoot(checkpoint);
+        }
+        MPIHelper::getInstance().sendCheckpoint(checkpoint, PROC_MASTER);
+        MPIHelper::getInstance().increaseTreeSent();
+
+        // now receive the candidate set
+        MPIHelper::getInstance().recvCheckpoint(checkpoint, PROC_MASTER);
+        if (checkpoint->getBool("stop")) {
+            cout << "Worker gets STOP message!" << endl;
+            stop_rule.shouldStop();
+        } else {
+            CandidateSet cset;
+            cset.setCheckpoint(checkpoint);
+            cset.restoreCheckpoint();
+            for (CandidateSet::iterator it = cset.begin(); it != cset.end(); it++)
+                addTreeToCandidateSet(it->second.tree, it->second.score, false, MPIHelper::getInstance().getProcessID());
+            MPIHelper::getInstance().increaseTreeReceived(cset.size());
+            if (boot_samples.size() > 0)
+                CKP_RESTORE(logl_cutoff);
+        }
+    }
+
+    delete checkpoint;
+
+#endif
+}
+
+void IQTree::sendStopMessage() {
+    if (MPIHelper::getInstance().getNumProcesses() == 1)
+        return;
+#ifdef _IQTREE_MPI
+
+    Checkpoint *checkpoint = new Checkpoint;
+    checkpoint->putBool("stop", true);
+    stringstream ss;
+    checkpoint->dump(ss);
+    string str = ss.str();
+    string tree;
+    double score;
+
+    cout << "Sending STOP message to workers" << endl;
+
+    // send STOP message to all processes
+    if (MPIHelper::getInstance().isMaster()) {
+        // repeatedly send stop message to all workers
+        for (int w = 1; w < MPIHelper::getInstance().getNumProcesses(); w++) {
+//            string buf;
+//            int worker = MPIHelper::getInstance().recvString(buf);
+            checkpoint->clear();
+            int worker = MPIHelper::getInstance().recvCheckpoint(checkpoint);
+            MPIHelper::getInstance().increaseTreeReceived();
+            CKP_RESTORE(tree);
+            CKP_RESTORE(score);
+            addTreeToCandidateSet(tree, score, true, worker);
+            MPIHelper::getInstance().sendString(str, worker, TREE_TAG);
+        }
+    }
+
+    delete checkpoint;
+
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+}
+
+
+int PhyloTree::testNumThreads() {
+#ifndef _OPENMP
+    return 1;
+#else
+	int max_procs = countPhysicalCPUCores();
+    cout << "Measuring multi-threading efficiency up to " << max_procs << " CPU cores" << endl;
+    DoubleVector runTimes;
+    int bestProc = 0;
+    double saved_curScore = curScore;
+    int num_iter = 1;
+
+    // generate different trees
+    int tree;
+    double min_time = max_procs; // minimum time in seconds
+    StrVector trees;
+    trees.push_back(getTreeString());
+
+    for (int proc = 1; proc <= max_procs; proc++) {
+
+        omp_set_num_threads(proc);
+        setLikelihoodKernel(sse, proc);
+        initializeAllPartialLh();
+
+        double beginTime = getRealTime();
+        double runTime, logl;
+
+        for (tree = 0; tree < trees.size(); tree++) {
+            readTreeString(trees[tree]);
+            logl = optimizeAllBranches(num_iter);
+            runTime = getRealTime() - beginTime;
+
+            // too fast, increase number of iterations
+            if (runTime*10 < min_time && proc == 1 && tree == 0) {
+                int new_num_iter = 10;
+                cout << "Increase to " << new_num_iter << " rounds for branch lengths" << endl;
+                logl = optimizeAllBranches(new_num_iter - num_iter);
+                num_iter = new_num_iter;
+                runTime = getRealTime() - beginTime;
+            }
+
+            // considering at least 2 trees
+            if ((runTime < min_time && proc == 1) || trees.size() == 1) {
+                // time not reached, add more tree
+//                readTreeString(trees[0]);
+//                doRandomNNIs();
+                generateRandomTree(YULE_HARDING);
+                wrapperFixNegativeBranch(true);
+                trees.push_back(getTreeString());
+            }
+            curScore = saved_curScore;
+        }
+
+        if (proc == 1)
+            cout << trees.size() << " trees examined" << endl;
+
+        deleteAllPartialLh();
+
+        runTimes.push_back(runTime);
+        double speedup = runTimes[0] / runTime;
+
+        cout << "Threads: " << proc << " / Time: " << runTime << " sec / Speedup: " << speedup
+            << " / Efficiency: " << (int)round(speedup*100/proc) << "% / LogL: " << (int)logl << endl;
+
+        // break if efficiency is too low (<= 50%) or run time is more than 10% worse than the best run time
+        if (speedup*2 <= proc || (runTime > runTimes[bestProc]*1.1 && proc>1))
+            break;
+
+        // update best threads if sufficient
+        if (runTime <= runTimes[bestProc]*0.95)
+            bestProc = proc-1;
+
+    }
+
+    readTreeString(trees[0]);
+
+    cout << "BEST NUMBER OF THREADS: " << bestProc+1 << endl << endl;
+    setLikelihoodKernel(sse, bestProc+1);
+
+    return bestProc+1;
+#endif
+}
diff --git a/iqtree.h b/iqtree.h
index 1a34ff0..b359971 100644
--- a/iqtree.h
+++ b/iqtree.h
@@ -1,6 +1,8 @@
 /***************************************************************************
- *   Copyright (C) 2009 by BUI Quang Minh   *
- *   minh.bui at univie.ac.at   *
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -74,9 +76,7 @@ inline int int_branch_cmp(const IntBranchInfo a, const IntBranchInfo b) {
 typedef multiset<RepLeaf*, nodeheightcmp> RepresentLeafSet;
 
 /**
-Important Quartet Puzzling
-
-        @author BUI Quang Minh <minh.bui at univie.ac.at>
+    Main class for tree search
  */
 class IQTree : public PhyloTree {
 public:
@@ -112,6 +112,19 @@ public:
     */
     virtual void restoreCheckpoint();
 
+    /**
+        save UFBoot_trees.
+        For MPI workers only save from sample_start to sample_end
+        @param checkpoint Checkpoint object
+    */
+    void saveUFBoot(Checkpoint *checkpoint);
+
+    /**
+
+        restore UFBoot_trees from sample_start to sample_end (MPI)
+        @param checkpoint Checkpoint object
+    */
+    void restoreUFBoot(Checkpoint *checkpoint);
 
     /**
      * setup all necessary parameters  (declared as virtual needed for phylosupertree)
@@ -136,6 +149,8 @@ public:
      */
     void printResultTree(ostream &out);
 
+    void printBestCandidateTree();
+
     /**
      * print phylolib tree to a file.
      * @param suffix suffix string for the tree file
@@ -207,18 +222,40 @@ public:
     void doIQP();
 
     /**
-     *  @brief remove all branches mapped to splits in \a split
-     *  @param nodes1 node vector containing one end of the branches
-     *  @param nodes2 node vector containing the other end of the branches
-     *  @return number of branches removed
+     *  @brief get non-tabu branches from a set of branches
+     *
+     *  @param
+     *  	allBranches[IN] the initial branches
+     *  @param
+     *  	tabuSplits[IN] the tabu splits
+     *  @param
+     *		nonTabuBranches[OUT] non-tabu branches from \a allBranches
+     *	@param[OUT]
+     *		tabuBranches branches that are tabu
+     */
+    void getNonTabuBranches(Branches& allBranches, SplitGraph& tabuSplits, Branches& nonTabuBranches, Branches* tabuBranches = NULL);
+
+    /**
+     * @brief remove all branches corresponding to nnis
+     * @param nodes1 node vector containing one end of the branches
+     * @param nodes2 node vector containing the other end of the branches
+     * @param nnis
+     * @return
      */
-    int removeBranches(NodeVector& nodes1, NodeVector& nodes2, SplitGraph& splits);
+    int removeNNIBranches(NodeVector& nodes1, NodeVector& nodes2, unordered_map<string, NNIMove> nnis);
 
     /**
      * 		Perform a series of random NNI moves
-     * 		@param numNNI number of random NNIs
+     * 		@return the perturbed newick string
+     */
+    string doRandomNNIs(bool storeTabu = false);
+
+    /**
+     *  Do a random NNI on splits that are shared among all the candidate trees.
+     *  @return the perturbed newick string
      */
-    void doRandomNNIs(int numNNI);
+    string perturbStableSplits(double supportValue);
+
 
     /**
      *   input model parameters from IQ-TREE to PLL
@@ -271,6 +308,9 @@ public:
      */
     double swapTaxa(PhyloNode *node1, PhyloNode *node2);
 
+    /** collect bootstrap trees from workers to master */
+    void collectBootTrees();
+
     /**
             perform tree search
             @return best likelihood found
@@ -288,13 +328,14 @@ public:
     /**
      *  Wrapper function to compute tree log-likelihood.
      *  This function with call either PLL or IQ-TREE to compute tree log-likelihood
+     *  @return current score of tree
      */
-    void computeLogL();
+    double computeLogL();
 
     /**
-     *	Print numBestScore found so far, starting from the highest
+     *	Print scores of the trees used for generating offspring
      */
-    void printBestScores(int numBestScore);
+    void printBestScores();
 
     /****************************************************************************
             Fast Nearest Neighbor Interchange by maximum likelihood
@@ -302,13 +343,79 @@ public:
 
 
     /**
-            This implement the fastNNI algorithm proposed in PHYML paper
-            TUNG: this is a virtual function, so it will be called automatically by optimizeNNIBranches()
-            @return best likelihood found
-            @param skipped (OUT) 1 if current iteration is skipped, otherwise 0
-            @param nni_count (OUT) the number of single NNI moves proceeded so far
+     *  Optimize current tree using NNI
+     *
+     *  @return
+     *      <number of NNI steps, number of NNIs> done
+     */
+    pair<int, int> optimizeNNI(bool speedNNI = true);
+
+    /**
+     *  Return the current best score found
+     */
+    double getBestScore();
+
+    /**
+     * @brief Generate a list of internal branches on which NNI moves will be evaluated
+     * @param
+     *      nonNNIBranches [OUT] Branches on which NNI evaluation will be skipped
+     * @param
+     *      tabuSplits [IN] A list of splits that are considered tabu
+     * @param
+     *      candidateSplitHash [IN] Lists that appear on the best 20 candidate trees
+     * @param
+     *      dad [IN] for navigation
+     * @param
+     *      node[IN] for navigation
+     * @return A list of branches for evaluating NNIs
+     */
+    void getNNIBranches(SplitIntMap &tabuSplits, SplitIntMap &candidateSplitHash, Branches &nonNNIBranches, Branches &outBranches, Node *dad = NULL, Node *node = NULL);
+
+    /**
+     *  Return internal branches that appear in \a candidateSplitHash
+     *  and has support value >= \a supportValue.
+     *  @param
+     *      candidateSplitHash [IN]   A set of splits with the number of occurrences.
+     *  @param
+     *      supportValue [IN]  Only consider split whose support value is higher than this number
+     *  @param
+     *      dad [IN] for navigation
+     *  @param
+     *      node[IN] for navigation
+     *  @return
+     *      A list of branches fulfilling the aforementioned conditions.
      */
-    double optimizeNNI(int &nni_count, int &nni_steps);
+    void getStableBranches(SplitIntMap &candSplits, double supportValue, Branches &outBranches, Node *dad = NULL, Node *node = NULL);
+
+
+    /**
+     *
+     *  Determine whether to evaluate NNI moves on the branch corresponding to the current split
+     *
+     *  @param curSplit [IN] the split that corresponds to the current branch
+     *  @param tabuSplits [IN] tabu splits
+     *  @param candSplits [IN] splits contained in all candidate trees
+     *  @param nonNNIBranches [OUT] branches that are not inserted into nniBranches are stored here
+     *  @param nniBranches [OUT] if the split is neither stable nor tabu it is inserted in this list
+     */
+    bool shouldEvaluate(Split* curSplit, SplitIntMap &tabuSplits, SplitIntMap &candSplits);
+
+
+    /**
+     *  @brief Only select NNI branches that are 2 branches away from the previously
+     *  applied NNIs
+     *  @param
+     *      appliedNNIs List of previously applied NNIs
+     *  @return
+     *      List of branches to be evaluated
+     */
+    void filterNNIBranches(vector<NNIMove> &appliedNNIs, Branches &outBranches);
+
+    
+    /**
+     * @brief get branches that correspond to the splits in \a splits
+     */
+    void getSplitBranches(Branches &branches, SplitIntMap &splits, Node *dad = NULL, Node *node = NULL);
 
     /**
      * 		Do fastNNI using PLL
@@ -320,30 +427,29 @@ public:
 
     /**
      * 		@brief Perform NNI search on the current tree topology
+     * 		@return <number_of_NNIs, number_of_NNI_steps>
      * 		This function will automatically use the selected kernel (either PLL or IQ-TREE)
-     *
-     * 		@param nniCount (OUT) number of NNIs applied
-     * 		@param nniSteps (OUT) number of NNI steps done
-     * 		@return the new NEWICK string
      */
-    string doNNISearch(int &nniCount, int &nniSteps);
+    pair<int, int> doNNISearch();
 
     /**
-            @brief evaluate all NNIs and store them in possilbleNNIMoves list
+            @brief evaluate all NNIs
             @param  node    evaluate all NNIs of the subtree rooted at node
             @param  dad     a neighbor of \p node which does not belong to the subtree
                             being considered (used for traverse direction)
 
      */
-    void evalNNIs(PhyloNode *node = NULL, PhyloNode *dad = NULL);
+    //void evalNNIs(PhyloNode *node = NULL, PhyloNode *dad = NULL);
 
     /**
-     * @brief Evaluate all NNIs on branch defined by \a nodes1 and \a nodes2
+     * @brief Evaluate all NNIs on branch defined by \a branches
      *
-     * @param[in] nodes1 contains one ends of the branches for NNI evaluation
-     * @param[in] nodes2 contains the other ends of the branches for NNI evaluation
+     * @param nniBranches [IN] the branches on which NNIs will be evaluated
+     * @param outNNIMoves [OUT] list of positive NNIs
      */
-    void evalNNIs(NodeVector &nodes1, NodeVector &nodes2);
+    void evaluateNNIs(Branches &nniBranches, vector<NNIMove> &outNNIMoves);
+
+    double optimizeNNIBranches(Branches &nniBranches);
 
     /**
             search all positive NNI move on the current tree and save them
@@ -352,11 +458,11 @@ public:
     void evalNNIsSort(bool approx_nni);
 
     /**
-            apply nni2apply NNIs from the non-conflicting NNI list
-            @param nni2apply number of NNIs to apply from the list
+            apply  NNIs from the non-conflicting NNI list
+            @param compatibleNNIs vector of all compatible NNIs
             @param changeBran whether or not the computed branch lengths should be applied
      */
-    virtual void doNNIs(int nni2apply, bool changeBran = true);
+    virtual void doNNIs(vector<NNIMove> &compatibleNNIs, bool changeBran = true);
 
     /**
      *  Restore the old 5 branch lengths stored in the NNI move.
@@ -365,43 +471,18 @@ public:
      */
     //void restoreNNIBranches(NNIMove nnimove);
 
-    /**
-            generate non conflicting NNI moves.
-            moves are saved in vec_nonconf_nni
-     */
-    void genNonconfNNIs();
-
-    /**
-            add a NNI move to the list of possible NNI moves;
-     */
-    void addPositiveNNIMove(NNIMove myMove);
-
-    /**
-     * 	Save all the current branch lengths
-     */
-//    void saveBranches(PhyloNode *node = NULL, PhyloNode *dad = NULL);
-
-    /**
-     * 	 Restore the branch lengths from the saved values
-     */
-//    virtual void restoreAllBrans(PhyloNode *node = NULL, PhyloNode *dad = NULL);
 
     /**
-     * Get the branch length of the branch node1-node2
-     * @param node1
-     * @param node2
-     * @return the branch length
+     *  @brief get a list of compatible NNIs from a list of NNIs
+     *  @param nniMoves [IN] list of NNIs
+     *  @param compatibleNNIs [OUT] list of compatible NNIs
      */
-//    double getBranLen(PhyloNode *node1, PhyloNode *node2);
-
+    void getCompatibleNNIs(vector<NNIMove> &nniMoves, vector<NNIMove> &compatibleNNIs);
 
     /**
-            Described in PhyML paper: apply change to branch that does not
-            correspond to a swap with the following formula l = l + lamda(la - l)
-            @param node1 the first node of the branch
-            @param node2 the second node of the branch
+            add a NNI move to the list of possible NNI moves;
      */
-//    void changeBranLen(PhyloNode *node1, PhyloNode *node2, double branLen);
+    void addPositiveNNIMove(NNIMove &myMove);
 
     /**
      * Estimate the 95% quantile of the distribution of N (see paper for more d
@@ -482,6 +563,7 @@ public:
      */
     vector<int> vecNumNNI;
 
+
     /**
      * Do memory allocation and initialize parameter for UFBoot to run with PLL
      */
@@ -530,6 +612,11 @@ public:
 
 protected:
     /**
+    *  Splits corresponding to random NNIs
+    */
+    SplitIntMap initTabuSplits;
+
+    /**
             criterion to assess important quartet
      */
     IQP_ASSESS_QUARTET iqp_assess_quartet;
@@ -541,37 +628,11 @@ protected:
     NodeVector taxaSet;
 
     /**
-     * confidence value for number of NNIs found in one iteration
-     */
-    int nni_count_est;
-
-    /**
-     * confidence value for likelihood improvement made by one NNI
-     */
-    double nni_delta_est;
-
-
-    /**
      *  Vector contains approximated improvement pro NNI at each iterations
      */
     vector<double> vecImpProNNI;
 
     /**
-        List of positive NNI for the current tree;
-     */
-    vector<NNIMove> plusNNIs;
-
-    /**
-        List of non-conflicting NNIs for the current tree;
-     */
-    vector<NNIMove> nonConfNNIs;
-
-    /**
-     *  NNIs that have been applied in the previous step
-     */
-    vector<NNIMove> appliedNNIs;
-
-    /**
         Optimal branch lengths
      */
 //    mapString2Double optBrans;
@@ -582,26 +643,66 @@ protected:
      *  @param[out] nodes2 the other ends of the branches
      *  @param[in] nnis NNIs that have been previously applied
      */
-    void getBranchesForNNI(NodeVector& nodes1, NodeVector& nodes2, vector<NNIMove>& nnis);
+    void generateNNIBranches(NodeVector& nodes1, NodeVector& nodes2, unordered_map<string, NNIMove>& nnis);
+
+    int k_delete, k_delete_min, k_delete_max, k_delete_stay;
+
+    /**
+            number of representative leaves for IQP step
+     */
+    int k_represent;
+
+public:
 
     /**
-     *  Use fastNNI heuristic
+     *  Candidate tree set: the current best N (default N = 5)
+     *  NNI-optimal trees
      */
-    bool fastNNI;
+    CandidateSet candidateTrees;
 
     /**
-            Original branch lengths
+     *  Set of all intermediate trees (initial trees, tree generated by NNI steps,
+     *  NNI-optimal trees)
      */
-//    mapString2Double orgBrans;
+    CandidateSet intermediateTrees;
 
-    int k_delete, k_delete_min, k_delete_max, k_delete_stay;
 
     /**
-            number of representative leaves for IQP step
-     */
-    int k_represent;
+     *  Update the candidate set with a new NNI-optimal tree. The maximum size of the candidate set
+     *  is fixed to the initial setting. Thus, if the size exceeds the maximum number of trees, the worst
+     *  tree will be removed.
+     *
+     *  @param treeString
+     *      the new tree
+     *  @param score
+     *      the score of the new tree
+     *  @param updateStopRule
+     *      Whether or not to update the stop rule
+     *  @return relative position of the new tree to the current best.
+     *      -1 if duplicated
+     *      -2 if the candidate set is not updated
+     */
+    int addTreeToCandidateSet(string treeString, double score, bool updateStopRule, int sourceProcID);
+
+    /**
+        MPI: synchronize candidate trees between all processes
+        @param nTrees number of trees to broadcast
+        @param updateStopRule true to update stopping rule, false otherwise
+    */
+    void syncCandidateTrees(int nTrees, bool updateStopRule);
 
-public:
+    /**
+        MPI: synchronize tree of current iteration with master
+        will update candidateset_changed
+        @param curTree current tree
+
+    */
+    void syncCurrentTree();
+
+    /**
+        MPI: Master sends stop message to all workers
+    */
+    void sendStopMessage();
 
     /**
      *  Generate the initial parsimony/random trees, called by initCandidateTreeSet
@@ -616,7 +717,6 @@ public:
      */
     void initCandidateTreeSet(int nParTrees, int nNNITrees);
 
-
     /**
      * Generate the initial tree (usually used for model parameter estimation)
      * @param dist_file only needed for BIONJ tree
@@ -637,9 +737,6 @@ public:
      */
     topol* pllBestTree;
 
-    CandidateSet candidateTrees;
-
-
     /****** following variables are for ultra-fast bootstrap *******/
 
     /** TRUE to save also branch lengths into treels_newick */
@@ -669,6 +766,12 @@ public:
     /** vector of bootstrap alignments generated */
     vector<BootValType* > boot_samples;
 
+    /** starting sample for UFBoot, used for MPI */
+    int sample_start;
+
+    /** end sample for UFBoot, used for MPI */
+    int sample_end;
+
     /** newick string of corresponding bootstrap trees */
     StrVector boot_trees;
 
@@ -715,7 +818,6 @@ protected:
      */
     vector<NNIInfo> nni_info;
 
-
     bool estimate_nni_cutoff;
 
     double nni_cutoff;
@@ -727,11 +829,12 @@ protected:
     ofstream outNNI;
 protected:
 
-    bool print_tree_lh;
+    //bool print_tree_lh;
 
-    int write_intermediate_trees;
+    //int write_intermediate_trees;
 
     ofstream out_treels, out_treelh, out_sitelh, out_treebetter;
+    string treels_name, out_lh_file, site_lh_file;
 
     void estimateNNICutoff(Params* params);
 
@@ -741,6 +844,12 @@ protected:
 
     int duplication_counter;
 
+    // MPI: vector of size = num processes, true if master should send candidate set to worker
+    BoolVector candidateset_changed;
+
+    // true if best candidate tree is changed
+    bool bestcandidate_changed;
+
     /**
             number of IQPNNI iterations
      */
@@ -843,9 +952,56 @@ protected:
 
     void estDeltaMin();
 
-};
+    void convertNNI2Splits(SplitIntMap &nniSplits, int numNNIs, vector<NNIMove> &compatibleNNIs);
+
+    string generateParsimonyTree(int randomSeed);
 
-void estimateNNICutoff(Params &params);
+#ifdef _IQTREE_MPI
+    /**
+     *  Receive trees from other processes and add them to the candidate set
+     *
+     *  @param allTrees
+     *      If true, wait for tree from every node
+     *      If false, only collect trees that have been sent
+     *  @param maxNumTrees
+     *      Only receive up to maxNumTrees trees, to prevent the function from blocking because it can constantly receive
+     *      new trees
+     *  @param updateStopRule
+     *      To update the stop rule or not
+     */
+    bool MPI_CollectTrees(bool allTrees, int maxNumTrees, bool updateStopRule);
+#endif
+
+    double doTreePerturbation();
+
+    void estimateLoglCutoffBS();
+
+    //void estimateNNICutoff(Params &params);
+
+public:
+    /**
+     *  Return best tree string from the candidate set
+     *
+     *  @param numTrees
+     *      Number of best trees to return
+     *  @return
+     *      A string vector of trees
+     */
+    vector<string> getBestTrees(int numTrees = 0);
 
+    /**
+     *  Print the iteration number and the tree score
+     */
+    void printIterationInfo(int sourceProcID);
 
+    /**
+     *  Return branches that are 2 branches away from the branches, on which NNIs were applied
+     *  in the previous NNI steps.
+     *  @param
+     *      previousNNIBranches[IN] a set of branches on which NNIs were performed in the previous NNI step.
+     *  @return
+     *      a set of branches, on which NNIs should be evaluated for the current NNI steps
+     */
+    Branches getReducedListOfNNIBranches(Branches &previousNNIBranches);
+};
 #endif
diff --git a/iqtree_config.h.in b/iqtree_config.h.in
index 9d3e17c..bf2233e 100644
--- a/iqtree_config.h.in
+++ b/iqtree_config.h.in
@@ -12,3 +12,6 @@
 /*#cmakedefine HAVE_PCLOSE*/
 /* does the platform provide GlobalMemoryStatusEx functions? */
 #cmakedefine HAVE_GLOBALMEMORYSTATUSEX
+
+/* does the platform provide backtrace functions? */
+#cmakedefine Backtrace_FOUND
diff --git a/lpwrapper.c b/lpwrapper.c
index 2b9ad4e..263d6f9 100644
--- a/lpwrapper.c
+++ b/lpwrapper.c
@@ -43,7 +43,7 @@ int lp_solve(char *filename, int ntaxa, double *score, double *variables, int ve
 	//write_lp(lp, name2);
 
 	if (lp == NULL) {
-		printf("Could not create an LP_SOLVE instance!\n");
+		printf("Could not create an LP_SOLVE getInstance!\n");
 		return 1;
 	}
 
diff --git a/memslot.cpp b/memslot.cpp
new file mode 100644
index 0000000..e56b807
--- /dev/null
+++ b/memslot.cpp
@@ -0,0 +1,254 @@
+/***************************************************************************
+ *   Copyright (C) 2009-2016 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *                                                                         *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include "phylotree.h"
+#include "memslot.h"
+
+const int MEM_LOCKED = 1;
+const int MEM_SPECIAL = 2;
+
+void MemSlotVector::init(PhyloTree *tree, int num_slot) { // set up num_slot slots over the tree's central buffers
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE)
+        return; // slots are only used in memory-saving mode
+    reserve(num_slot+2); // capacity for two entries beyond the regular slots
+    resize(num_slot);
+    const size_t lh_stride = tree->getPartialLhSize();
+    const size_t scale_stride = tree->getScaleNumSize();
+    reset();
+    for (size_t slot = 0; slot < size(); slot++) { // slot i gets the i-th slice of each central buffer
+        (*this)[slot].partial_lh = tree->central_partial_lh + lh_stride*slot;
+        (*this)[slot].scale_num = tree->central_scale_num + scale_stride*slot;
+    }
+}
+
+void MemSlotVector::reset() { // mark every slot unassigned and forget all neighbor-to-slot mappings
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE)
+        return;
+    free_count = 0; // allocate() starts handing out slots from index 0 again
+    nei_id_map.clear();
+    for (size_t slot = 0; slot < size(); slot++) {
+        (*this)[slot].nei = NULL;
+        (*this)[slot].status = 0; // neither MEM_LOCKED nor MEM_SPECIAL
+    }
+}
+
+
+MemSlotVector::iterator MemSlotVector::findNei(PhyloNeighbor *nei) {
+    // nei must be in nei_id_map; the slot is not checked to still list nei as owner (update() tests that)
+    auto entry = nei_id_map.find(nei);
+    assert(entry != nei_id_map.end());
+    return begin() + entry->second;
+}
+
+void MemSlotVector::addNei(PhyloNeighbor *nei, iterator it) {
+    // make nei the owner of slot `it`, point nei at the slot's buffers, then record the slot ID
+    it->nei = nei;
+    nei->scale_num = it->scale_num;
+    nei->partial_lh = it->partial_lh;
+    nei_id_map[nei] = distance(begin(), it);
+}
+
+
+void MemSlotVector::addSpecialNei(PhyloNeighbor *nei) { // append a locked "special" slot that wraps nei's own buffers
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE)
+        return;
+    MemSlot ms = MemSlot(); // value-initialized so saved_nei starts as NULL instead of indeterminate
+    ms.status = MEM_SPECIAL + MEM_LOCKED;
+    ms.nei = nei;
+    ms.partial_lh = nei->partial_lh; // the slot mirrors nei's pointers; no central memory is assigned
+    ms.scale_num = nei->scale_num;
+    push_back(ms);
+    nei_id_map[nei] = size()-1; // the new slot is the last element of the vector
+}
+
+void MemSlotVector::eraseSpecialNei() { // pop all trailing special slots together with their map entries
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE)
+        return;
+    while (!empty() && (back().status & MEM_SPECIAL)) { // back() on an empty vector is undefined
+        nei_id_map.erase(back().nei);
+        pop_back();
+    }
+}
+
+
+bool MemSlotVector::lock(PhyloNeighbor *nei) {
+    // nothing is locked outside memory-saving mode or for leaf neighbors
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE || nei->node->isLeaf())
+        return false;
+    iterator slot = findNei(nei);
+    const bool special = (slot->status & MEM_SPECIAL) != 0;
+    if (special)
+        return false;
+    assert((slot->status & MEM_LOCKED) == 0);
+    slot->status |= MEM_LOCKED;
+    return true;
+}
+
+void MemSlotVector::unlock(PhyloNeighbor *nei) {
+    // ignored outside memory-saving mode and for leaf neighbors
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE || nei->node->isLeaf())
+        return;
+    iterator slot = findNei(nei);
+    if ((slot->status & MEM_SPECIAL) == 0) {
+        // regular slot: it is expected to be locked at this point
+        assert((slot->status & MEM_LOCKED) != 0);
+        slot->status &= ~MEM_LOCKED;
+    }
+}
+
+bool MemSlotVector::locked(PhyloNeighbor *nei) {
+    // Reports whether the slot mapped to nei carries MEM_LOCKED.
+    // Always false outside memory-saving mode and for leaf neighbors.
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE || nei->node->isLeaf())
+        return false;
+
+    const int slot_status = findNei(nei)->status;
+
+    // special slots are reported as not locked, whatever their MEM_LOCKED bit says
+    if (slot_status & MEM_SPECIAL)
+        return false;
+
+    const bool has_lock_bit = (slot_status & MEM_LOCKED) != 0;
+    return has_lock_bit;
+}
+
+int MemSlotVector::allocate(PhyloNeighbor *nei) {
+    // Returns the ID of the slot handed to nei, or -1 when no slot can be given
+    // (always -1 outside memory-saving mode).
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE)
+        return -1;
+
+    // 1) hand out the next never-used slot, if there is one and it is not special
+    if (free_count < size() && (at(free_count).status & MEM_SPECIAL) == 0) {
+        const int fresh_id = free_count;
+        assert(at(fresh_id).nei == NULL);
+        addNei(nei, begin() + fresh_id);
+        free_count++;
+        return fresh_id;
+    }
+
+    // 2) otherwise reuse an unlocked, non-special slot, preferring the one whose
+    //    current neighbor has the smallest size
+    iterator victim = end();
+    int smallest = INT_MAX;
+    for (iterator cand = begin(); cand != end(); ++cand) {
+        if (cand->status & (MEM_LOCKED | MEM_SPECIAL))
+            continue;
+        if (!(smallest > cand->nei->size))
+            continue;
+        victim = cand;
+        smallest = cand->nei->size;
+        if (smallest == 2) // 2 is the minimum size
+            break;
+    }
+
+    if (victim == end())
+        return -1;
+
+    // the previous owner loses its partial likelihood before nei takes over the slot
+    victim->nei->clearPartialLh();
+    addNei(nei, victim);
+    return victim - begin();
+}
+
+void MemSlotVector::update(PhyloNeighbor *nei) {
+    // Re-claims the slot mapped to nei when that slot currently lists another neighbor as owner.
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE)
+        return;
+
+    iterator slot = findNei(nei);
+    PhyloNeighbor *owner = slot->nei;
+    if (owner == nei)
+        return; // nei still owns its slot, nothing to do
+
+    // the current owner's partial likelihood is dropped ...
+    owner->clearPartialLh();
+    // ... and the slot's buffers are handed to nei
+    addNei(nei, slot);
+}
+
+/*
+void MemSlotVector::cleanup() {
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE)
+        return;
+    unordered_map<PhyloNeighbor*, iterator> new_map;
+    for (auto it = nei_id_map.begin(); it != nei_id_map.end(); it++)
+        if (it->first != it->second->nei) {
+            it->first->partial_lh_computed &= ~1; // clear bit
+            it->first->partial_lh = NULL;
+            it->first->scale_num = NULL;
+        } else {
+            new_map[it->first] = it->second;
+        }
+    nei_id_map = new_map;
+    assert(nei_id_map.size() == size());
+}
+*/
+
+void MemSlotVector::takeover(PhyloNeighbor *nei, PhyloNeighbor *taken_nei) { // move taken_nei's buffers (and slot mapping) to nei
+    assert(taken_nei->partial_lh);
+    nei->partial_lh = taken_nei->partial_lh; // the pointer hand-over is done in every memory mode
+    nei->scale_num = taken_nei->scale_num;
+    taken_nei->partial_lh = NULL;
+    taken_nei->scale_num = NULL;
+    taken_nei->partial_lh_computed &= ~1; // clear bit
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE)
+        return; // the slot bookkeeping below is only done in memory-saving mode
+    iterator id = findNei(taken_nei);
+//    if (id->status & MEM_SPECIAL)
+//        return;
+    nei_id_map.erase(nei_id_map.find(taken_nei));
+    nei_id_map[nei] = id - begin(); // nei now maps to the slot taken_nei was mapped to
+    if (id->nei == taken_nei) { // ownership changes only if taken_nei was still the slot's owner
+        id->nei = nei;
+    }
+}
+
+void MemSlotVector::replace(PhyloNeighbor *new_nei, PhyloNeighbor *old_nei) {
+    // lend old_nei's slot to new_nei until restore(); old_nei keeps its nei_id_map entry
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE)
+        return;
+    iterator slot = findNei(old_nei);
+    assert(slot->partial_lh == old_nei->partial_lh);
+    slot->saved_nei = slot->nei; // previous owner, put back by restore()
+    slot->status = MEM_LOCKED | MEM_SPECIAL;
+    slot->scale_num = new_nei->scale_num;
+    slot->partial_lh = new_nei->partial_lh;
+    slot->nei = new_nei;
+    nei_id_map[new_nei] = distance(begin(), slot);
+    cout << "slot " << distance(begin(), slot) << " replaced" << endl;
+}
+
+void MemSlotVector::restore(PhyloNeighbor *new_nei, PhyloNeighbor *old_nei) { // undo a previous replace(new_nei, old_nei)
+    if (Params::getInstance().lh_mem_save != LM_MEM_SAVE)
+        return;
+    iterator it = findNei(new_nei);
+    assert(it->nei == new_nei);
+    assert(nei_id_map[old_nei] == it-begin()); // NOTE(review): operator[] inserts old_nei if it is missing (only when asserts are compiled in)
+    it->nei = it->saved_nei; // owner recorded by replace()
+    it->saved_nei = NULL;
+    it->partial_lh = old_nei->partial_lh; // the slot points at old_nei's buffers again
+    it->scale_num = old_nei->scale_num;
+    it->status = 0; // neither locked nor special any more
+    nei_id_map.erase(new_nei);
+//    nei_id_map[old_nei] = it;
+    cout << "slot " << distance(begin(), it) << " restored" << endl; // NOTE(review): unconditional stdout output, looks like leftover debugging -- confirm
+}
diff --git a/memslot.h b/memslot.h
new file mode 100644
index 0000000..dfb3f3f
--- /dev/null
+++ b/memslot.h
@@ -0,0 +1,111 @@
+/***************************************************************************
+ *   Copyright (C) 2009-2016 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *                                                                         *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef MEMSLOT_H
+#define MEMSLOT_H
+
+#ifndef PHYLOTREE_H
+#error "Please #include phylotree.h before including this header file" 
+#endif
+
+/**
+    one memory slot, used for memory saving technique
+*/
+struct MemSlot {
+    int status; // bit flags of this slot (MEM_LOCKED / MEM_SPECIAL, defined in memslot.cpp); 0 = plain slot
+    PhyloNeighbor *nei; // neighbor assigned to this slot
+    double *partial_lh; // partial_lh assigned to this slot
+    UBYTE *scale_num; // scale_num assigned to this slot
+    // owner remembered by MemSlotVector::replace() and put back by MemSlotVector::restore()
+    PhyloNeighbor *saved_nei;
+};
+
+/**
+    all memory slots, used for memory saving technique
+*/
+class MemSlotVector : public vector<MemSlot> {
+public:
+
+    /** initialize with a specified number of slots taken from the tree's central buffers */
+    void init(PhyloTree *tree, int num_slot);
+
+    /**
+        lock the memory assigned to nei
+        @param nei neighbor to lock
+        @return TRUE if successfully locked, FALSE otherwise (leaf, special slot, or not in memory-saving mode)
+    */
+    bool lock(PhyloNeighbor *nei);
+
+    /** unlock the memory assigned to nei */
+    void unlock(PhyloNeighbor *nei);
+
+    /** test if the memory assigned to nei is locked or not */
+    bool locked(PhyloNeighbor *nei);
+
+    /** allocate free or unlocked memory to nei; returns the slot ID, or -1 if none could be given */
+    int allocate(PhyloNeighbor *nei);
+
+    /** make nei the owner of its mapped slot again if another neighbor currently owns it */
+    void update(PhyloNeighbor *nei);
+
+    /** find the slot (as iterator) that a neighbor is mapped to */
+    iterator findNei(PhyloNeighbor *nei);
+
+    /** assign the slot at the specified iterator to a neighbor */
+    void addNei(PhyloNeighbor *nei, iterator it);
+
+    /** reset everything */
+    void reset();
+
+    /** clean up all neighbors where partial_lh_computed = 0 (NOTE: the definition in memslot.cpp is commented out) */
+    void cleanup();
+
+    /** take over neighbor from another one */
+    void takeover(PhyloNeighbor *nei, PhyloNeighbor *taken_nei);
+
+    /** add special neighbor e.g. for NNI */
+    void addSpecialNei(PhyloNeighbor *nei);
+
+    /** erase special neighbors e.g. for NNI */
+    void eraseSpecialNei();
+
+    /** replace a neighbor, used for NNI */
+    void replace(PhyloNeighbor *new_nei, PhyloNeighbor *old_nei);
+
+    /** restore neighbor, after calling replace */
+    void restore(PhyloNeighbor *new_nei, PhyloNeighbor *old_nei);
+
+protected:
+
+
+    /**
+        map from neighbor to slot ID for fast lookup
+        IMPORTANT: mapping to ID instead of (unsafe) iterator
+    */
+    unordered_map<PhyloNeighbor*, int> nei_id_map;
+
+    /** ID of the next never-used slot; allocate() hands slots out in this order */
+    int free_count;
+
+};
+
+
+#endif // MEMSLOT_H
diff --git a/mexttree.cpp b/mexttree.cpp
index d9ee2f7..29f5789 100644
--- a/mexttree.cpp
+++ b/mexttree.cpp
@@ -59,7 +59,7 @@ void MExtTree::generateRandomTree(TreeGenType tree_type, Params &params, bool bi
 
 void MExtTree::setZeroInternalBranches(int num_zero_len) {
 	NodeVector nodes, nodes2;
-	getAllInnerBranches(nodes, nodes2);
+	generateNNIBraches(nodes, nodes2);
 	if (num_zero_len > nodes.size()) outError("The specified number of zero branches is too much");
 	for (int i = 0; i < num_zero_len;) {
 		int id = random_int(nodes.size());
@@ -367,10 +367,79 @@ void MExtTree::generateYuleHarding(Params &params, bool binary) {
 
 }
 
+void MExtTree::generateConstrainedYuleHarding(Params &params, MTree* constraint_tree, StrVector &taxnames) {
+	int size = taxnames.size();
+	if (size < 3)
+		outError(ERR_FEW_TAXA);
+	NodeVector myleaves;
+	NodeVector innodes;
+    StrVector names;
+    StringIntMap namemap;
+    StrVector::iterator it;
+    // Overview: start from the resolved constraint tree, then graft the remaining taxa one by one.
+    // copy constraint tree and resolve multifurcation
+    copyTree(constraint_tree);
+    resolveMultifurcation();
+    // collect the leaves and names already in the tree; namemap marks those names as present
+    getTaxa(myleaves);
+    getTaxaName(names);
+    for (it = names.begin(); it != names.end(); it++)
+        namemap[*it] = 1;
+
+    // append the taxa names that are not yet in the tree
+    for (it = taxnames.begin(); it != taxnames.end(); it++)
+        if (namemap.find(*it) == namemap.end())
+            names.push_back(*it);
+    assert(names.size() == taxnames.size()); // presumably requires every constraint-tree taxon to occur in taxnames -- confirm
+    my_random_shuffle(names.begin()+leafNum, names.end()); // only the not-yet-placed names are shuffled
+
+	// graft one new leaf per iteration until all taxnames are in the tree
+	for (; leafNum < size; leafNum++)
+	{
+		int index;
+		index = random_int(leafNum); // pick an existing leaf; presumably uniform in [0, leafNum) -- confirm
+        Node *leaf = myleaves[index];
+        Node *dad = leaf->neighbors[0]->node;
+        // a new internal node is inserted between dad and leaf and also carries the new leaf;
+        // -1.0 is passed as length for all new branches, real lengths are assigned after the loop
+        Node *newleaf = newNode(leafNum, names[leafNum].c_str());
+        Node *node = newNode();
+
+        // redirect the current leaf
+        node->addNeighbor(leaf, -1.0);
+        leaf->updateNeighbor(dad, node);
+        
+        // add the new leaf
+        node->addNeighbor(newleaf, -1.0);
+        newleaf->addNeighbor(node, -1.0);
+
+        // connect dad and new node
+        dad->updateNeighbor(leaf, node);
+        node->addNeighbor(dad, -1.0);
+
+        myleaves.push_back(newleaf);
+	}
+
+    // assign random branch lengths (the two vectors are reused to hold the branch end points)
+    myleaves.clear();
+    innodes.clear();
+    getBranches(myleaves, innodes);
+    for (int i = 0; i < myleaves.size(); i++) {
+        double len = randomLen(params);
+        myleaves[i]->findNeighbor(innodes[i])->length = len;
+        innodes[i]->findNeighbor(myleaves[i])->length = len;
+    }
+    
+    // NOTE(review): internal nodes were created by newNode() without an explicit id; presumably initializeTree() numbers them -- confirm
+	nodeNum = leafNum;
+	initializeTree();
+
+}
+
 void MExtTree::generateStarTree(Params &params) {
 	generateYuleHarding(params);
 	NodeVector nodes, nodes2;
-	getAllInnerBranches(nodes, nodes2);
+	generateNNIBraches(nodes, nodes2);
 	for (int i = 0; i < nodes.size(); i++) {
 		nodes[i]->findNeighbor(nodes2[i])->length = 0.0;
 		nodes2[i]->findNeighbor(nodes[i])->length = 0.0;
diff --git a/mexttree.h b/mexttree.h
index b7d501c..bf4f72d 100644
--- a/mexttree.h
+++ b/mexttree.h
@@ -75,6 +75,16 @@ public:
 	void generateYuleHarding(Params &params, bool binary = true);
 
 	/**
+		generate a random tree following Yule-Harding model satisfying a constraint tree;
+		multifurcations of the constraint tree are resolved before the remaining taxa are added
+		@param params program parameters
+        @param constraint_tree a (multifurcating) constraint tree
+        @param taxnames taxa names
+	*/
+	void generateConstrainedYuleHarding(Params &params, MTree* constraint_tree, StrVector &taxnames);
+
+
+	/**
 		generate a random tree following uniform model
 		@param size number of taxa
 		@param binary TRUE if you want to generate a binary tree
diff --git a/model/modelcodon.cpp b/model/modelcodon.cpp
index e866e63..4e8a91d 100644
--- a/model/modelcodon.cpp
+++ b/model/modelcodon.cpp
@@ -544,6 +544,14 @@ void ModelCodon::computeRateAttributes() {
     }
     
     if (verbose_mode >= VB_MAX) {
+
+        // make cost matrix fulfill triangular inequality
+        for (int k = 0; k < 20; k++)
+            for (i = 0; i < 20; i++)
+                for (j = 0; j < 20; j++)
+                    if (aa_cost_change[i*20+j] > aa_cost_change[i*20+k] + aa_cost_change[k*20+j])
+                        aa_cost_change[i*20+j] = aa_cost_change[i*20+k] + aa_cost_change[k*20+j];
+
         cout << "cost matrix by number of nt changes for TNT use" << endl;
         cout << "smatrix =1 (aa_nt_changes)";
         for (i = 0; i < 19; i++)
diff --git a/model/modelfactory.cpp b/model/modelfactory.cpp
index 8bbe92b..88cb0ab 100644
--- a/model/modelfactory.cpp
+++ b/model/modelfactory.cpp
@@ -338,6 +338,8 @@ ModelFactory::ModelFactory(Params &params, PhyloTree *tree, ModelsBlock *models_
 		delete [] rates;
 		delete [] state_freq;
 
+        models->joinEigenMemory();
+
         // delete information of the old alignment
 //        tree->aln->ordered_pattern.clear();
 //        tree->deleteAllPartialLh();
@@ -358,15 +360,21 @@ ModelFactory::ModelFactory(Params &params, PhyloTree *tree, ModelsBlock *models_
 //		if (unobserved_ptns.size() <= 0)
 //			outError("Invalid use of +ASC because all constant patterns are observed in the alignment");
 		if (tree->aln->frac_invariant_sites > 0) {
-            cerr << tree->aln->frac_invariant_sites*tree->aln->getNSite() << " invariant sites are observed in the alignment (see below)" << endl;
-            for (Alignment::iterator pit = tree->aln->begin(); pit != tree->aln->end(); pit++)
-                if (pit->isInvariant()) {
-                    string pat_str = "";
-                    for (Pattern::iterator it = pit->begin(); it != pit->end(); it++)
-                        pat_str += tree->aln->convertStateBackStr(*it);
-                    cerr << pat_str << " is invariant site pattern" << endl;
-                }
-            outError("Invalid use of +ASC in the presence of invariant sites");
+//            cerr << tree->aln->frac_invariant_sites*tree->aln->getNSite() << " invariant sites observed in the alignment" << endl;
+//            for (Alignment::iterator pit = tree->aln->begin(); pit != tree->aln->end(); pit++)
+//                if (pit->isInvariant()) {
+//                    string pat_str = "";
+//                    for (Pattern::iterator it = pit->begin(); it != pit->end(); it++)
+//                        pat_str += tree->aln->convertStateBackStr(*it);
+//                    cerr << pat_str << " is invariant site pattern" << endl;
+//                }
+            if (!params.partition_file) {                
+                string varsites_file = ((string)params.out_prefix + ".varsites.phy");
+                tree->aln->printPhylip(varsites_file.c_str(), false, NULL, false, true);
+                cerr << "For your convenience alignment with variable sites printed to " << varsites_file << endl;
+            } 
+            outError("Invalid use of +ASC because of " + convertIntToString(tree->aln->frac_invariant_sites*tree->aln->getNSite()) +
+                " invariant sites in the alignment");
         }
 		cout << "Ascertainment bias correction: " << unobserved_ptns.size() << " unobservable constant patterns"<< endl;
 		rate_str = rate_str.substr(0, posasc) + rate_str.substr(posasc+4);
@@ -678,7 +686,7 @@ double ModelFactory::optimizeAllParameters(double gradient_epsilon) {
 }
 
 double ModelFactory::optimizeParametersGammaInvar(int fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
-    if (!site_rate->isGammai())
+    if (!site_rate->isGammai() || site_rate->isFixPInvar() || site_rate->isFixGammaShape() || site_rate->getTree()->aln->frac_const_sites == 0.0)
         return optimizeParameters(fixed_len, write_info, logl_epsilon, gradient_epsilon);
         
 	double begin_time = getRealTime();
@@ -844,7 +852,7 @@ double ModelFactory::optimizeParameters(int fixed_len, bool write_info,
 	assert(model);
 	assert(site_rate);
 
-    double defaultEpsilon = logl_epsilon;
+//    double defaultEpsilon = logl_epsilon;
 
 	double begin_time = getRealTime();
 	double cur_lh;
@@ -856,7 +864,7 @@ double ModelFactory::optimizeParameters(int fixed_len, bool write_info,
     // no optimization of branch length in the first round
     cur_lh = tree->computeLikelihood();
     tree->setCurScore(cur_lh);
-	if (verbose_mode >= VB_MED || write_info) 
+	if (write_info)
 		cout << "1. Initial log-likelihood: " << cur_lh << endl;
 
 	// For UpperBounds -----------
@@ -899,7 +907,7 @@ double ModelFactory::optimizeParameters(int fixed_len, bool write_info,
 		}
 		if (new_lh > cur_lh + logl_epsilon) {
 			cur_lh = new_lh;
-			if (verbose_mode >= VB_MED || write_info)
+			if (write_info)
 				cout << i << ". Current log-likelihood: " << cur_lh << endl;
 		} else {
 			site_rate->classifyRates(new_lh);
diff --git a/model/modelmixture.cpp b/model/modelmixture.cpp
index bca673d..6b6bfdf 100644
--- a/model/modelmixture.cpp
+++ b/model/modelmixture.cpp
@@ -14,7 +14,9 @@
 #include "modelmorphology.h"
 #include "modelset.h"
 #include "modelmixture.h"
-#include "phylokernelmixture.h"
+//#include "phylokernelmixture.h"
+
+using namespace std;
 
 const string builtin_mixmodels_definition =
 "#nexus\n\
@@ -1426,6 +1428,7 @@ double ModelMixture::optimizeWeights() {
             for (c = 0; c < nmix; c++) {
                 lk_ptn += this_lk_cat[c];
             }
+            assert(lk_ptn != 0.0);
             lk_ptn = phylo_tree->ptn_freq[ptn] / lk_ptn;
             for (c = 0; c < nmix; c++) {
                 new_prop[c] += this_lk_cat[c] * lk_ptn;
@@ -1435,9 +1438,14 @@ double ModelMixture::optimizeWeights() {
         double new_pinvar = 0.0;    
         for (c = 0; c < nmix; c++) {
             new_prop[c] /= phylo_tree->getAlnNSite();
+            // Make sure that probabilities do not get zero
+            if (new_prop[c] < 1e-10) new_prop[c] = 1e-10;
             // check for convergence
             converged = converged && (fabs(prop[c]-new_prop[c]) < 1e-4);
             ratio_prop[c] = new_prop[c] / prop[c];
+            if (std::isnan(ratio_prop[c])) {
+                cerr << "BUG: " << new_prop[c] << " " << prop[c] << " " << ratio_prop[c] << endl;
+            }
             prop[c] = new_prop[c];
             new_pinvar += prop[c];
         }
@@ -1474,7 +1482,8 @@ double ModelMixture::optimizeWithEM(double gradient_epsilon) {
     
     tree->copyPhyloTree(phylo_tree);
     tree->optimize_by_newton = phylo_tree->optimize_by_newton;
-    tree->setLikelihoodKernel(phylo_tree->sse);
+    tree->setParams(phylo_tree->params);
+    tree->setLikelihoodKernel(phylo_tree->sse, phylo_tree->num_threads);
     // initialize model
     ModelFactory *model_fac = new ModelFactory();
     model_fac->joint_optimize = phylo_tree->params->optimize_model_rate_joint;
@@ -1506,6 +1515,7 @@ double ModelMixture::optimizeWithEM(double gradient_epsilon) {
             for (c = 0; c < nmix; c++) {
                 lk_ptn += this_lk_cat[c];
             }
+            assert(lk_ptn != 0.0);
             lk_ptn = phylo_tree->ptn_freq[ptn] / lk_ptn;
             
             // transform _pattern_lh_cat into posterior probabilities of each category
@@ -1523,6 +1533,7 @@ double ModelMixture::optimizeWithEM(double gradient_epsilon) {
             double new_pinvar = 0.0;
             for (c = 0; c < nmix; c++) {
                 new_prop[c] = new_prop[c] / phylo_tree->getAlnNSite();
+                if (new_prop[c] < 1e-10) new_prop[c] = 1e-10;
                 // check for convergence
                 converged = converged && (fabs(prop[c]-new_prop[c]) < 1e-4);
                 prop[c] = new_prop[c];
@@ -1608,6 +1619,17 @@ double ModelMixture::optimizeParameters(double gradient_epsilon) {
 	return score;
 }
 
+bool ModelMixture::isUnstableParameters() {
+    // Emits one warning if any mixture weight is below MIN_MIXTURE_PROP*0.1; the result is always false.
+    for (int c = 0, ncategory = size(); c < ncategory; c++)
+        if (prop[c] < MIN_MIXTURE_PROP*0.1) {
+            outWarning("The mixture model might be overfitting because some mixture weights are estimated close to zero");
+            // NOTE(review): an unreachable 'return true' followed this break and was dropped (behavior unchanged); confirm whether TRUE was intended
+            break;
+        }
+    return false;
+}
+
 void ModelMixture::decomposeRateMatrix() {
 	for (iterator it = begin(); it != end(); it++)
 		(*it)->decomposeRateMatrix();
diff --git a/model/modelmixture.h b/model/modelmixture.h
index 0ace5c5..5c3caa3 100644
--- a/model/modelmixture.h
+++ b/model/modelmixture.h
@@ -86,6 +86,12 @@ public:
 	 */
 	virtual int getNMixtures() {return size(); }
 
+ 	/**
+	 * @param cat mixture class, used directly as index into prop (no range check is done here)
+	 * @return weight of a mixture model component
+	 */
+	virtual double getMixtureWeight(int cat) { return prop[cat]; }
+
 	/**
 		@return the number of dimensions
 	*/
@@ -124,6 +130,11 @@ public:
 	virtual double optimizeParameters(double gradient_epsilon);
 
 	/**
+	 * @return TRUE if parameters are at the boundary that may cause numerical instability
+	 */
+	virtual bool isUnstableParameters();
+
+	/**
 		decompose the rate matrix into eigenvalues and eigenvectors
 	*/
 	virtual void decomposeRateMatrix();
diff --git a/model/modelpomo.cpp b/model/modelpomo.cpp
index c9c08e2..268508d 100644
--- a/model/modelpomo.cpp
+++ b/model/modelpomo.cpp
@@ -8,11 +8,10 @@
 #include "modelpomo.h"
 
 ModelPoMo::ModelPoMo() {
-	// TODO Auto-generated constructor stub
 
 }
 
 ModelPoMo::~ModelPoMo() {
-	// TODO Auto-generated destructor stub
+
 }
 
diff --git a/model/modelset.cpp b/model/modelset.cpp
index 83c7d1a..83feabb 100644
--- a/model/modelset.cpp
+++ b/model/modelset.cpp
@@ -28,6 +28,8 @@ ModelSet::ModelSet(const char *model_name, PhyloTree *tree) : ModelGTR(tree)
 
 void ModelSet::computeTransMatrix(double time, double* trans_matrix)
 {
+    // TODO not working with vectorization
+    assert(0);
 	for (iterator it = begin(); it != end(); it++) {
 		(*it)->computeTransMatrix(time, trans_matrix);
 		trans_matrix += (num_states * num_states);
@@ -36,6 +38,8 @@ void ModelSet::computeTransMatrix(double time, double* trans_matrix)
 
 void ModelSet::computeTransMatrixFreq(double time, double* trans_matrix)
 {
+    // TODO not working with vectorization
+    assert(0);
 	for (iterator it = begin(); it != end(); it++) {
 		(*it)->computeTransMatrixFreq(time, trans_matrix);
 		trans_matrix += (num_states * num_states);
@@ -44,6 +48,8 @@ void ModelSet::computeTransMatrixFreq(double time, double* trans_matrix)
 
 void ModelSet::computeTransDerv(double time, double* trans_matrix, double* trans_derv1, double* trans_derv2)
 {
+    // TODO not working with vectorization
+    assert(0);
 	for (iterator it = begin(); it != end(); it++) {
 		(*it)->computeTransDerv(time, trans_matrix, trans_derv1, trans_derv2);
 		trans_matrix += (num_states * num_states);
@@ -54,6 +60,8 @@ void ModelSet::computeTransDerv(double time, double* trans_matrix, double* trans
 
 void ModelSet::computeTransDervFreq(double time, double rate_val, double* trans_matrix, double* trans_derv1, double* trans_derv2)
 {
+    // TODO not working with vectorization
+    assert(0);
 	for (iterator it = begin(); it != end(); it++) {
 		(*it)->computeTransDervFreq(time, rate_val, trans_matrix, trans_derv1, trans_derv2);
 		trans_matrix += (num_states * num_states);
@@ -71,12 +79,50 @@ int ModelSet::getPtnModelID(int ptn)
 
 
 double ModelSet::computeTrans(double time, int model_id, int state1, int state2) {
-	return at(model_id)->computeTrans(time, state1, state2);
+    // with vector_size 1 the eigen data are not interleaved, so the sub-model answers directly
+    if (phylo_tree->vector_size == 1)
+        return at(model_id)->computeTrans(time, state1, state2);
+    // otherwise the eigen data of vsize consecutive sub-models are interleaved
+    // (see decomposeRateMatrix), so every index below is strided by vsize
+    const int vsize = phylo_tree->vector_size;
+    const int lane = model_id % vsize;        // position inside the group of vsize models
+    const int group_start = model_id - lane;  // first model of that group
+    double *eval = &eigenvalues[group_start*num_states + lane];
+    double *evec = &eigenvectors[group_start*num_states*num_states + lane + state1*num_states*vsize];
+    double *inv_evec = &inv_eigenvectors[group_start*num_states*num_states + lane + state2*vsize];
+    // sum over eigen components k of evec[state1][k] * inv_evec[k][state2] * exp(time*eval[k])
+    double trans_prob = 0.0;
+    for (int k = 0; k < num_states; k++) {
+        const int off = k*vsize;
+        trans_prob += evec[off] * inv_evec[off*num_states] * exp(time * eval[off]);
+    }
+	return trans_prob;
 }
 
 double ModelSet::computeTrans(double time, int model_id, int state1, int state2, double &derv1, double &derv2) {
-	return at(model_id)->computeTrans(time, state1, state2, derv1, derv2);
-	
+    // with vector_size 1 the eigen data are not interleaved, so the sub-model answers directly
+    if (phylo_tree->vector_size == 1)
+        return at(model_id)->computeTrans(time, state1, state2, derv1, derv2);
+
+    // interleaved layout (see decomposeRateMatrix): every index below is strided by vsize
+    const int vsize = phylo_tree->vector_size;
+    const int lane = model_id % vsize;        // position inside the group of vsize models
+    const int group_start = model_id - lane;  // first model of that group
+    double *eval = &eigenvalues[group_start*num_states + lane];
+    double *evec = &eigenvectors[group_start*num_states*num_states + lane + state1*num_states*vsize];
+    double *inv_evec = &inv_eigenvectors[group_start*num_states*num_states + lane + state2*vsize];
+    double trans_prob = 0.0;
+    derv1 = derv2 = 0.0;
+    for (int k = 0; k < num_states; k++) {
+        const int off = k*vsize;
+        const double rate = eval[off];
+        const double term = evec[off] * inv_evec[off*num_states] * exp(time * rate);
+        const double term_d1 = term * rate; // contribution to the first derivative w.r.t. time
+        trans_prob += term;
+        derv1 += term_d1;
+        derv2 += term_d1 * rate;
+    }
+	return trans_prob;
 }
 
 int ModelSet::getNDim()
@@ -87,8 +133,9 @@ int ModelSet::getNDim()
 
 void ModelSet::writeInfo(ostream& out)
 {
-	assert(size());
-	if (verbose_mode >= VB_MED) {
+    if (empty())
+        return;
+	if (verbose_mode >= VB_DEBUG) {
 		int i = 1;
 		for (iterator it = begin(); it != end(); it++, i++) {
 			out << "Partition " << i << ":" << endl;
@@ -101,8 +148,40 @@ void ModelSet::writeInfo(ostream& out)
 
 void ModelSet::decomposeRateMatrix()
 {
+    if (empty())
+        return; // no sub-models, nothing to decompose
 	for (iterator it = begin(); it != end(); it++)
 		(*it)->decomposeRateMatrix();
+	if (phylo_tree->vector_size == 1)
+		return; // with vector_size 1 no rearrangement is done
+	// rearrange eigen to obey vector_size: within each group of vsize models, element x of model i moves to x*vsize+i
+	size_t vsize = phylo_tree->vector_size;
+	size_t states2 = num_states*num_states;
+	size_t ptn, i, x;
+    double new_eval[num_states*vsize]; // NOTE(review): runtime-sized arrays are a compiler extension in C++ -- confirm toolchain support
+    double new_evec[states2*vsize];
+    double new_inv_evec[states2*vsize];
+	// NOTE(review): if size() is not a multiple of vsize the last group reads past the last model; presumably covered by the padding in joinEigenMemory() -- confirm
+	for (ptn = 0; ptn < size(); ptn += vsize) {
+		double *eval_ptr = &eigenvalues[ptn*num_states];
+		double *evec_ptr = &eigenvectors[ptn*states2];
+		double *inv_evec_ptr = &inv_eigenvectors[ptn*states2];
+		for (i = 0; i < vsize; i++) { // i = position of the model inside the current group
+			for (x = 0; x < num_states; x++)
+				new_eval[x*vsize+i] = eval_ptr[x];
+			for (x = 0; x < states2; x++) {
+				new_evec[x*vsize+i] = evec_ptr[x];
+				new_inv_evec[x*vsize+i] = inv_evec_ptr[x];
+			}
+			eval_ptr += num_states; // step to the next model of the group
+			evec_ptr += states2;
+			inv_evec_ptr += states2;
+		}
+		// copy the interleaved group back over the original (per-model) data
+        memcpy(&eigenvalues[ptn*num_states], new_eval, sizeof(double)*num_states*vsize);
+        memcpy(&eigenvectors[ptn*states2], new_evec, sizeof(double)*states2*vsize);
+        memcpy(&inv_eigenvectors[ptn*states2], new_inv_evec, sizeof(double)*states2*vsize);
+	}
 }
 
 
@@ -124,8 +203,49 @@ void ModelSet::setVariables(double* variables)
 
 ModelSet::~ModelSet()
 {
-    for (reverse_iterator rit = rbegin(); rit != rend(); rit++)
-        delete (*rit);
-    
+	for (reverse_iterator rit = rbegin(); rit != rend(); rit++) { // delete the models in reverse order
+		(*rit)->eigenvalues = NULL; // clear the three eigen pointers first; NOTE(review): presumably so the model does not free memory that joinEigenMemory() placed in ModelSet-owned chunks — confirm
+		(*rit)->eigenvectors = NULL;
+		(*rit)->inv_eigenvectors = NULL;
+		delete (*rit);
+	}
 }
 
+void ModelSet::joinEigenMemory() { // merge every model's eigen arrays into three shared chunks owned by this ModelSet
+    size_t nmixtures = get_safe_upper_limit(size()); // number of slots: size() passed through get_safe_upper_limit()
+	if (eigenvalues) aligned_free(eigenvalues); // release the chunks currently held by the set, if any
+	if (eigenvectors) aligned_free(eigenvectors);
+	if (inv_eigenvectors) aligned_free(inv_eigenvectors);
+
+    size_t states2 = num_states*num_states;
+
+	eigenvalues = aligned_alloc<double>(num_states*nmixtures);
+	eigenvectors = aligned_alloc<double>(states2*nmixtures);
+	inv_eigenvectors = aligned_alloc<double>(states2*nmixtures);
+
+	// give each model its own slot (index m) inside the shared chunks
+	size_t m = 0;
+	for (iterator it = begin(); it != end(); it++, m++) {
+        // first copy the model's current eigen data into its slot
+        memcpy(&eigenvalues[m*num_states], (*it)->eigenvalues, num_states*sizeof(double));
+        memcpy(&eigenvectors[m*states2], (*it)->eigenvectors, states2*sizeof(double));
+        memcpy(&inv_eigenvectors[m*states2], (*it)->inv_eigenvectors, states2*sizeof(double));
+        // then release the model's own arrays
+		if ((*it)->eigenvalues) aligned_free((*it)->eigenvalues);
+		if ((*it)->eigenvectors) aligned_free((*it)->eigenvectors);
+		if ((*it)->inv_eigenvectors) aligned_free((*it)->inv_eigenvectors);
+//		if ((*it)->eigen_coeff) aligned_free((*it)->eigen_coeff);
+
+        // and repoint the model at its slot in the shared chunks
+		(*it)->eigenvalues = &eigenvalues[m*num_states];
+		(*it)->eigenvectors = &eigenvectors[m*states2];
+		(*it)->inv_eigenvectors = &inv_eigenvectors[m*states2];
+	}
+
+    // fill the padding slots size() .. nmixtures-1, each copied from the slot before it (so all repeat the last model)
+    for (m = size(); m < nmixtures; m++) {
+        memcpy(&eigenvalues[m*num_states], &eigenvalues[(m-1)*num_states], sizeof(double)*num_states);
+        memcpy(&eigenvectors[m*states2], &eigenvectors[(m-1)*states2], sizeof(double)*states2);
+        memcpy(&inv_eigenvectors[m*states2], &inv_eigenvectors[(m-1)*states2], sizeof(double)*states2);
+    }
+}
diff --git a/model/modelset.h b/model/modelset.h
index 6c2ae9f..18c59b1 100644
--- a/model/modelset.h
+++ b/model/modelset.h
@@ -168,7 +168,12 @@ public:
 
 	/** map from pattern ID to model ID */
 	IntVector pattern_model_map;
-	
+
+    /**
+        join memory for eigen into one chunk
+    */
+    void joinEigenMemory();
+
 protected:
 	
 	
diff --git a/model/modelsubst.h b/model/modelsubst.h
index a2ca4ba..4e9dc4f 100644
--- a/model/modelsubst.h
+++ b/model/modelsubst.h
@@ -76,6 +76,12 @@ public:
 	 */
 	virtual int getNMixtures() { return 1; }
 
+ 	/**
+	 * @param cat mixture class
+	 * @return weight of a mixture model component
+	 */
+	virtual double getMixtureWeight(int cat) { return 1.0; }
+
 	/**
 		@return the number of rate entries, equal to the number of elements
 			in the upper-diagonal of the rate matrix (since model is reversible)
diff --git a/model/partitionmodel.cpp b/model/partitionmodel.cpp
index a2942d6..8108cb8 100644
--- a/model/partitionmodel.cpp
+++ b/model/partitionmodel.cpp
@@ -137,7 +137,7 @@ double PartitionModel::optimizeLinkedAlpha(bool write_info, double gradient_epsi
 	double negative_lh;
 	double current_shape = linked_alpha;
 	double ferror, optx;
-	optx = minimizeOneDimen(MIN_GAMMA_SHAPE, current_shape, MAX_GAMMA_SHAPE, max(gradient_epsilon, TOL_GAMMA_SHAPE), &negative_lh, &ferror);
+	optx = minimizeOneDimen(site_rate->getTree()->params->min_gamma_shape, current_shape, MAX_GAMMA_SHAPE, max(gradient_epsilon, TOL_GAMMA_SHAPE), &negative_lh, &ferror);
     if (write_info)
         cout << "Linked alpha across partitions: " << linked_alpha << endl;
 	return site_rate->getTree()->computeLikelihood();
@@ -151,7 +151,7 @@ double PartitionModel::optimizeParameters(int fixed_len, bool write_info, double
 
     if (tree->part_order.empty()) tree->computePartitionOrder();
 	#ifdef _OPENMP
-	#pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(ntrees >= tree->params->num_threads)
+	#pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(tree->num_threads > 1)
 	#endif
     for (int i = 0; i < ntrees; i++) {
         int part = tree->part_order[i];
@@ -183,7 +183,7 @@ double PartitionModel::optimizeParametersGammaInvar(int fixed_len, bool write_in
 
     if (tree->part_order.empty()) tree->computePartitionOrder();
 	#ifdef _OPENMP
-	#pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(ntrees >= tree->params->num_threads)
+	#pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(tree->num_threads > 1)
 	#endif
     for (int i = 0; i < ntrees; i++) {
         int part = tree->part_order[i];
diff --git a/model/ratefree.cpp b/model/ratefree.cpp
index 4e84f81..60c302f 100644
--- a/model/ratefree.cpp
+++ b/model/ratefree.cpp
@@ -7,6 +7,7 @@
 
 #include "phylotree.h"
 #include "ratefree.h"
+#include "rateinvar.h"
 
 #include "model/modelfactory.h"
 #include "model/modelmixture.h"
@@ -209,7 +210,7 @@ double RateFree::optimizeParameters(double gradient_epsilon) {
 		cout << "Optimizing " << name << " model parameters by " << optimize_alg << " algorithm..." << endl;
 
     // TODO: turn off EM algorithm for +ASC model
-    if (optimize_alg.find("EM") != string::npos && phylo_tree->getModelFactory()->unobserved_ptns.empty())
+    if ((optimize_alg.find("EM") != string::npos && phylo_tree->getModelFactory()->unobserved_ptns.empty()) || getPInvar() <= MIN_PINVAR)
         return optimizeWithEM();
 
 	//if (freq_type == FREQ_ESTIMATE) scaleStateFreq(false);
@@ -457,9 +458,16 @@ double RateFree::optimizeWithEM() {
 //    double *lk_ptn = aligned_alloc<double>(nptn);
     double *new_prop = aligned_alloc<double>(nmix);
     PhyloTree *tree = new PhyloTree;
+
+    // attach memory to save space
+//    tree->central_partial_lh = phylo_tree->central_partial_lh;
+//    tree->central_scale_num = phylo_tree->central_scale_num;
+//    tree->central_partial_pars = phylo_tree->central_partial_pars;
+
     tree->copyPhyloTree(phylo_tree);
     tree->optimize_by_newton = phylo_tree->optimize_by_newton;
-    tree->setLikelihoodKernel(phylo_tree->sse);
+    tree->setParams(phylo_tree->params);
+    tree->setLikelihoodKernel(phylo_tree->sse, phylo_tree->num_threads);
     // initialize model
     ModelFactory *model_fac = new ModelFactory();
     model_fac->joint_optimize = phylo_tree->params->optimize_model_rate_joint;
@@ -543,7 +551,7 @@ double RateFree::optimizeWithEM() {
 
         new_pinvar = 1.0 - new_pinvar;
 
-        if (new_pinvar != 0.0) {
+        if (new_pinvar > 1e-4 && getPInvar() != 0.0) {
             converged = converged && (fabs(getPInvar()-new_pinvar) < 1e-4);
             setPInvar(new_pinvar);
 //            setOptimizePInvar(false);
@@ -564,8 +572,8 @@ double RateFree::optimizeWithEM() {
             tree->setModel(subst_model);
             subst_model->setTree(tree);
             model_fac->model = subst_model;
-            if (subst_model->isMixture())
-                tree->setLikelihoodKernel(phylo_tree->sse);
+            if (subst_model->isMixture() || subst_model->isSiteSpecificModel())
+                tree->setLikelihoodKernel(phylo_tree->sse, phylo_tree->num_threads);
 
                         
             // initialize likelihood
@@ -591,6 +599,11 @@ double RateFree::optimizeWithEM() {
         if (converged) break;
     }
     
+    // detach memory
+//    tree->central_partial_lh = NULL;
+//    tree->central_scale_num = NULL;
+//    tree->central_partial_pars = NULL;
+
     delete tree;
     aligned_free(new_prop);
     return phylo_tree->computeLikelihood();
diff --git a/model/rategamma.cpp b/model/rategamma.cpp
index 030abdc..0b9f5e8 100644
--- a/model/rategamma.cpp
+++ b/model/rategamma.cpp
@@ -31,14 +31,14 @@ RateGamma::RateGamma(int ncat, double shape, bool median, PhyloTree *tree) : Rat
 	phylo_tree = tree;
 	cut_median = median;
 	//gamma_shape = MAX_GAMMA_SHAPE-1.0;
-	gamma_shape = max(MIN_GAMMA_SHAPE, fabs(shape));
+	gamma_shape = max(tree->params->min_gamma_shape, fabs(shape));
 	fix_gamma_shape = false;
 	rates = NULL;
 	if (shape > 0.0) {
 //		gamma_shape = shape;
 		fix_gamma_shape = true;
 	} else if (shape == 0.0) {
-		gamma_shape = max(MIN_GAMMA_SHAPE, random_double() * 10.0);
+		gamma_shape = max(tree->params->min_gamma_shape, random_double() * 10.0);
 		cout << "Randomize initial gamma shape (alpha): " << gamma_shape << endl;
 	}
 	setNCategory(ncat);
@@ -190,7 +190,7 @@ double RateGamma::targetFunk(double x[]) {
 
 void RateGamma::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) {
 	if (getNDim() == 0) return;
-	lower_bound[1] = MIN_GAMMA_SHAPE;
+	lower_bound[1] = phylo_tree->params->min_gamma_shape;
 	upper_bound[1] = MAX_GAMMA_SHAPE;
 	bound_check[1] = false;
 }
@@ -233,7 +233,7 @@ double RateGamma::optimizeParameters(double gradient_epsilon) {
 	double negative_lh;
 	double current_shape = gamma_shape;
 	double ferror, optx;
-	optx = minimizeOneDimen(MIN_GAMMA_SHAPE, current_shape, MAX_GAMMA_SHAPE, max(gradient_epsilon, TOL_GAMMA_SHAPE), &negative_lh, &ferror);
+	optx = minimizeOneDimen(phylo_tree->params->min_gamma_shape, current_shape, MAX_GAMMA_SHAPE, max(gradient_epsilon, TOL_GAMMA_SHAPE), &negative_lh, &ferror);
 //	gamma_shape = optx;
 //	computeRates();
 //	phylo_tree->clearAllPartialLH();
diff --git a/model/rategamma.h b/model/rategamma.h
index 19fc809..3ecbb98 100644
--- a/model/rategamma.h
+++ b/model/rategamma.h
@@ -22,12 +22,6 @@
 
 #include "rateheterogeneity.h"
 
-const double MIN_GAMMA_RATE = 1e-6;
-// change from 0.01 to 0.02 as 0.01 causes numerical problems
-const double MIN_GAMMA_SHAPE = 0.02;
-const double MAX_GAMMA_SHAPE = 1000.0;
-const double TOL_GAMMA_SHAPE = 0.001;
-
 const int GAMMA_CUT_MEDIAN = 1; // 2 discrete Gamma approximations (mean or median) of Yang 1994
 const int GAMMA_CUT_MEAN   = 2;
 
@@ -182,7 +176,7 @@ public:
 	*/
 	virtual void writeParameters(ostream &out);
 
-	bool isFixGammaShape() const {
+	virtual bool isFixGammaShape() const {
 		return fix_gamma_shape;
 	}
 
diff --git a/model/rategammainvar.cpp b/model/rategammainvar.cpp
index 70ebaa7..55e6a8e 100644
--- a/model/rategammainvar.cpp
+++ b/model/rategammainvar.cpp
@@ -136,7 +136,7 @@ double RateGammaInvar::optimizeParameters(double gradient_epsilon) {
 
 	if (optimize_alg.find("EM_RR") != string::npos) {
         return randomRestartOptimization(gradient_epsilon);
-    } else if (optimize_alg.find("Brent") != string::npos) {
+    } else if (optimize_alg.find("Brent") != string::npos || phylo_tree->aln->frac_const_sites == 0.0 || isFixPInvar() || isFixGammaShape()) {
 		double lh = phylo_tree->computeLikelihood();
 		cur_optimize = 0;
 		double gamma_lh = RateGamma::optimizeParameters(gradient_epsilon);
@@ -177,6 +177,7 @@ double RateGammaInvar::optimizeParameters(double gradient_epsilon) {
     } else {
         string errMsg = "Unknown optimization algorithm: " + optimize_alg;
         outError(errMsg.c_str());
+        return 0.0;
     }
 }
 
diff --git a/model/rateheterogeneity.h b/model/rateheterogeneity.h
index c0c4ab5..1d2dcea 100644
--- a/model/rateheterogeneity.h
+++ b/model/rateheterogeneity.h
@@ -146,7 +146,11 @@ public:
 	*/
 	virtual void setPInvar(double pinv) { }
 
-    /**
+	virtual bool isFixPInvar() const {
+		return true;
+	}
+
+	/**
         set whether to fix p_invar
     */
 	virtual void setFixPInvar(bool fixPInvar) {}
@@ -169,7 +173,11 @@ public:
 	*/	
 	virtual void setGammaShape(double gs) {}
 
-    /**
+	virtual bool isFixGammaShape() const {
+		return true;
+	}
+
+	/**
         set whether to fix gamma shape
     */
 	virtual void setFixGammaShape(bool fixGammaShape) {}
diff --git a/model/rateinvar.cpp b/model/rateinvar.cpp
index 55988c5..0a68d2f 100644
--- a/model/rateinvar.cpp
+++ b/model/rateinvar.cpp
@@ -22,10 +22,13 @@
 RateInvar::RateInvar(double p_invar_sites, PhyloTree *tree)
  : RateHeterogeneity()
 {
-	if (tree)
-		p_invar = max(tree->aln->frac_const_sites/2.0, MIN_PINVAR);
+	if (tree) {
+        if (tree->aln->frac_const_sites == 0.0)
+            p_invar = 0.0;
+        else
+            p_invar = max(tree->aln->frac_const_sites/2.0, MIN_PINVAR);
 //		p_invar = MIN_PINVAR;
-	else
+	} else
 		p_invar = MIN_PINVAR;
 	fix_p_invar = false;
     optimize_p_invar = true;
@@ -85,6 +88,8 @@ void RateInvar::setBounds(double *lower_bound, double *upper_bound, bool *bound_
 }
 
 double RateInvar::optimizeParameters(double gradient_epsilon) {
+    if (phylo_tree->aln->frac_const_sites == 0.0)
+        return -computeFunction(0.0);
 	if (fix_p_invar || !optimize_p_invar)
 		return -computeFunction(p_invar);
 	if (verbose_mode >= VB_MAX)
diff --git a/model/rateinvar.h b/model/rateinvar.h
index 6ae98ed..acf070a 100644
--- a/model/rateinvar.h
+++ b/model/rateinvar.h
@@ -121,7 +121,7 @@ public:
 	*/
 	virtual void writeParameters(ostream &out);
 
-	bool isFixPInvar() const {
+	virtual bool isFixPInvar() const {
 		return fix_p_invar;
 	}
 
diff --git a/mtree.cpp b/mtree.cpp
index 9a39486..ee363d0 100644
--- a/mtree.cpp
+++ b/mtree.cpp
@@ -21,6 +21,7 @@
 #include <iostream>
 //#include <fstream>
 #include <iterator>
+//#include <mtree.h>
 #include "splitgraph.h"
 using namespace std;
 
@@ -65,6 +66,23 @@ MTree::MTree(MTree &tree) {
     init(tree);
 }
 
+MTree::MTree(string& treeString, vector<string>& taxaNames, bool isRooted) {
+    // Parse the Newick text from an in-memory stream positioned at its start,
+    // then let assignIDs() set leaf IDs from taxaNames before assignLeafID().
+    stringstream newick(treeString);
+    readTree(newick, isRooted);
+    assignIDs(taxaNames);
+    assignLeafID();
+}
+
+MTree::MTree(string& treeString, bool isRooted) {
+    // Parse the Newick text from an in-memory stream positioned at its start,
+    // then call assignLeafID(); no taxon-name list is involved here.
+    stringstream newick(treeString);
+    readTree(newick, isRooted);
+    assignLeafID();
+}
+
 void MTree::init(MTree &tree) {
     root = tree.root;
     leafNum = tree.leafNum;
@@ -78,6 +96,41 @@ void MTree::init(MTree &tree) {
     fig_char = tree.fig_char;
 }
 
+void MTree::assignIDs(vector<string>& taxaNames) {
+    // Two-way check between taxaNames and the leaves of the tree. Every
+    // mismatch is reported via outError(msg, false); one final outError() follows if any occurred.
+    bool mismatch = false;
+
+    // 1) each given name must be found by findLeafName(); its index becomes the node ID
+    for (int idx = 0, total = taxaNames.size(); idx < total; idx++) {
+        string name = taxaNames[idx];
+        Node *leaf = findLeafName(name);
+        if (leaf) {
+            assert(leaf->isLeaf());
+            leaf->id = idx;
+            continue;
+        }
+        mismatch = true;
+        outError("Sequence " + name + " does not appear in the tree", false);
+    }
+
+    // 2) each name returned by getTaxaName() must occur among the given names
+    StrVector treeTaxa;
+    getTaxaName(treeTaxa);
+    for (StrVector::iterator t = treeTaxa.begin(); t != treeTaxa.end(); t++) {
+        // linear scan of the given names, stopping at the first match
+        vector<string>::iterator hit = taxaNames.begin();
+        while (hit != taxaNames.end() && !(*t == *hit))
+            hit++;
+        if (hit == taxaNames.end()) {
+            outError((string) "Tree taxon " + (*t) + " does not appear in the input taxa names", false);
+            mismatch = true;
+        }
+    }
+
+    if (mismatch) outError("Tree taxa and input taxa names do not match (see above)");
+}
+
 void MTree::copyTree(MTree *tree) {
     if (root) freeNode();
     stringstream ss;
@@ -153,6 +206,95 @@ Node* MTree::copyTree(MTree *tree, string &taxa_set, double &len, Node *node, No
     return int_node;
 }
 
+void MTree::extractBifurcatingSubTree(Node *node, Node *dad) {
+    // Reduce each node of degree > 3 to three neighbors: the position found by
+    // findNeighborIt(dad) plus two drawn with random_int(); all others are freed.
+    if (!node) node = root;
+    if (node->degree() > 3) {
+        int keep[3];
+        keep[0] = node->findNeighborIt(dad) - node->neighbors.begin();
+        // draw two further indices, each distinct from those already chosen
+        do {
+            keep[1] = random_int(node->degree());
+        } while (keep[1] == keep[0]);
+        do {
+            keep[2] = random_int(node->degree());
+        } while (keep[2] == keep[0] || keep[2] == keep[1]);
+        // sort the three indices ascending with three compare-and-swap steps
+        if (keep[0] > keep[1]) {
+            int t = keep[0];
+            keep[0] = keep[1];
+            keep[1] = t;
+        }
+        if (keep[1] > keep[2]) {
+            int t = keep[1];
+            keep[1] = keep[2];
+            keep[2] = t;
+        }
+        if (keep[0] > keep[1]) {
+            int t = keep[0];
+            keep[0] = keep[1];
+            keep[1] = t;
+        }
+        // free what hangs behind every neighbor that is not kept
+        for (int i = 0; i != node->neighbors.size(); i++) {
+            if (i == keep[0] || i == keep[1] || i == keep[2])
+                continue;
+            freeNode(node->neighbors[i]->node, node);
+            delete node->neighbors[i];
+        }
+        // ascending order gives keep[k] >= k, so no kept pointer is
+        // overwritten before it has been moved to the front
+        for (int k = 0; k < 3; k++)
+            node->neighbors[k] = node->neighbors[keep[k]];
+        node->neighbors.erase(node->neighbors.begin()+3, node->neighbors.end());
+    }
+    // continue through the remaining non-leaf neighbors
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        if (!(*it)->node->isLeaf())
+            extractBifurcatingSubTree((*it)->node, node);
+    }
+}
+
+void MTree::resolveMultifurcation() {
+    // For every node from getInternalNodes(): while its degree exceeds 3, move
+    // two randomly drawn neighbors onto a new node linked back to it.
+    NodeVector internals;
+    getInternalNodes(internals);
+    for (NodeVector::iterator cur = internals.begin(); cur != internals.end(); cur++) {
+        Node *hub = *cur;
+        while (hub->degree() > 3) {
+            Node *joint = newNode();
+            int lo = random_int(hub->degree());
+            int hi;
+            do {
+                hi = random_int(hub->degree());
+            } while (hi == lo);
+            if (lo > hi) {
+                // keep lo < hi so the two erase calls below stay index-safe
+                int swapped = lo;
+                lo = hi;
+                hi = swapped;
+            }
+            Neighbor *first = hub->neighbors[lo];
+            Neighbor *second = hub->neighbors[hi];
+
+            // re-home both picked neighbors onto the new node
+            first->node->updateNeighbor(hub, joint);
+            joint->neighbors.push_back(first);
+            second->node->updateNeighbor(hub, joint);
+            joint->neighbors.push_back(second);
+
+            // link the new node with the original one; the higher index is
+            // erased first so the lower index still refers to the same entry
+            joint->addNeighbor(hub, -1.0);
+            hub->neighbors.erase(hub->neighbors.begin() + hi);
+            hub->neighbors.erase(hub->neighbors.begin() + lo);
+            hub->addNeighbor(joint, -1.0);
+        }
+    }
+}
+
 Node* MTree::newNode(int node_id, const char* node_name) {
     return new Node(node_id, node_name);
 }
@@ -236,12 +378,6 @@ void MTree::printTree(const char *ofile, int brtype)
     }
 }
 
-//string MTree::getTreeString() {
-//	stringstream tree_stream;
-//	printTree(tree_stream);
-//	return tree_stream.str();
-//}
-
 void MTree::printTree(ostream &out, int brtype) {
     if (root->isLeaf()) {
         if (root->neighbors[0]->node->isLeaf()) {
@@ -301,11 +437,14 @@ int MTree::printTree(ostream &out, int brtype, Node *node, Node *dad)
             out << node->name;
 
         if (brtype & WT_BR_LEN) {
+            int prec = 10;
         	out.setf( std::ios::fixed, std:: ios::floatfield ); // some sofware does handle number format like '1.234e-6'
 //            out.precision(10); // increase precision to avoid zero branch (like in RAxML)
         	double len = node->neighbors[0]->length;
             if (brtype & WT_BR_SCALE) len *= len_scale;
+            if (brtype & WT_BR_LEN_SHORT) prec = 6;
             if (brtype & WT_BR_LEN_ROUNDING) len = round(len);
+            out.precision(prec);
             if (brtype & WT_BR_LEN_FIXED_WIDTH)
                 out << ":" << fixed << len;
             else
@@ -499,6 +638,16 @@ void MTree::readTree(const char *infile, bool &is_rooted) {
              " taxa and " << nodeNum-1-is_rooted << " branches" << endl;
 }
 
+/*
+void MTree::readTreeString(string tree_string, bool is_rooted) {
+	stringstream str;
+	str << tree_string;
+	str.seekg(0, ios::beg);
+	freeNode();
+	readTree(str, is_rooted);
+}
+*/
+
 
 void MTree::readTree(istream &in, bool &is_rooted)
 {
@@ -774,7 +923,7 @@ void MTree::getTaxa(NodeVector &taxa, Node *node, Node *dad) {
 }
 
 void MTree::getAllNodesInSubtree(Node *node, Node *dad, NodeVector &nodeList) {
-    assert(node && dad);
+    assert(node);
     nodeList.push_back(node);
     if (node->isLeaf()) {
         return;
@@ -812,13 +961,13 @@ void MTree::getInternalNodes(NodeVector &nodes, Node *node, Node *dad) {
     }
 }
 
-void MTree::getAllInnerBranches(NodeVector &nodes1, NodeVector &nodes2, SplitGraph* excludeSplits, Node *node, Node *dad) {
+void MTree::generateNNIBraches(NodeVector &nodes1, NodeVector &nodes2, SplitGraph* excludeSplits, Node *node, Node *dad) {
     if (!node) node = root;
     //for (NeighborVec::iterator it = node->neighbors.begin(); it != node->neighbors.end(); it++)
     //if ((*it)->node != dad)	{
     FOR_NEIGHBOR_IT(node, dad, it)
     if (!(*it)->node->isLeaf()) {
-        getAllInnerBranches(nodes1, nodes2, excludeSplits, (*it)->node, node);
+        generateNNIBraches(nodes1, nodes2, excludeSplits, (*it)->node, node);
         if (!node->isLeaf()) {
         	if (excludeSplits != NULL && excludeSplits->size() != 0) {
         		Split* sp = getSplit(node, (*it)->node);
@@ -839,73 +988,54 @@ void MTree::getAllInnerBranches(NodeVector &nodes1, NodeVector &nodes2, SplitGra
     }
 }
 
-bool MTree::branchExist(Node* node1, Node* node2, NodeVector& nodes1, NodeVector& nodes2) {
-	assert(nodes1.size() == nodes2.size());
-	bool existed = false;
-	for (int i = 0; i < nodes1.size(); i++) {
-		if (nodes1[i] == node1) {
-			if (nodes2[i] == node2) {
-				existed = true;
-				break;
-			}
-		}
-		if (nodes1[i] == node2) {
-			if (nodes2[i] == node1) {
-				existed = true;
-				break;
-			}
-		}
-	}
-	return existed;
-}
+//bool MTree::branchExist(Node* node1, Node* node2, NodeVector& nodes1, NodeVector& nodes2) {
+//	assert(nodes1.size() == nodes2.size());
+//	bool existed = false;
+//	for (int i = 0; i < nodes1.size(); i++) {
+//		if (nodes1[i] == node1) {
+//			if (nodes2[i] == node2) {
+//				existed = true;
+//				break;
+//			}
+//		}
+//		if (nodes1[i] == node2) {
+//			if (nodes2[i] == node1) {
+//				existed = true;
+//				break;
+//			}
+//		}
+//	}
+//	return existed;
+//}
 
-void MTree::getInnerBranches(NodeVector &nodes1, NodeVector &nodes2, int depth, Node *node, Node *dad) {
+void MTree::getSurroundingInnerBranches(Node *node, Node *dad, int depth, Branches &surrBranches) {
     if (depth == 0)
       return;
     FOR_NEIGHBOR_IT(node, dad, it) {
-        if (!(*it)->node->isLeaf() && !branchExist(node, (*it)->node, nodes1, nodes2)) {
-        	nodes1.push_back(node);
-        	nodes2.push_back((*it)->node);
-            getInnerBranches(nodes1, nodes2, depth-1, (*it)->node, node);
+        if (!(*it)->node->isLeaf()) {
+            Branch curBranch;
+            curBranch.first = node;
+            curBranch.second = (*it)->node;
+            int branchID = pairInteger(node->id, (*it)->node->id);
+            if (surrBranches.find(branchID) == surrBranches.end())
+                surrBranches.insert(pair<int,Branch>(branchID, curBranch));
+            getSurroundingInnerBranches((*it)->node, node, depth-1, surrBranches);
         }
     }
 }
 
 bool MTree::isInnerBranch(Node* node1, Node* node2) {
-    assert(node1->degree() == 3 && node2->degree() == 3);
-    return (isABranch(node1, node2) && !node1->isLeaf() && !node2->isLeaf());
+    return(node1->degree() >= 3 && node2->degree() >= 3 && isABranch(node1, node2));
 }
 
 bool MTree::isABranch(Node* node1, Node* node2) {
-	bool isBranch1 = false;
-	for (NeighborVec::iterator it = node1->neighbors.begin(); it != node1->neighbors.end(); it++) {
-		if ((*it)->node == node2) {
-			isBranch1 = true;
-			break;
-		}
-	}
-	// Sanity check: both nodes must have each other as neighbors or not at all
-	bool isBranch2 = false;
-	for (NeighborVec::iterator it = node2->neighbors.begin(); it != node2->neighbors.end(); it++) {
-		if ((*it)->node == node1) {
-			isBranch2 = true;
-			break;
-		}
-	}
-	if (isBranch2 != isBranch1) {
-		int node1ID = node1->id;
-		int node2ID = node2->id;
-		stringstream msg;
-		msg << "Tree data structure corrupted! Node " << node1ID << " and node " << node2ID << " are not constructed properly";
-		outError(msg.str());
-	}
-	return isBranch1;
+    return (node1->findNeighbor(node2) != NULL && node2->findNeighbor(node1) != NULL);
 }
 
 void MTree::getBranches(NodeVector &nodes, NodeVector &nodes2, Node *node, Node *dad) {
     if (!node) node = root;
     //for (NeighborVec::iterator it = node->neighbors.begin(); it != node->neighbors.end(); it++)
-    //if ((*it)->node != dad)	{
+    //if ((*it)->node != dad)   {
     FOR_NEIGHBOR_IT(node, dad, it) {
         if (node->id < (*it)->node->id) {
             nodes.push_back(node);
@@ -918,6 +1048,19 @@ void MTree::getBranches(NodeVector &nodes, NodeVector &nodes2, Node *node, Node
     }
 }
 
+void MTree::getInnerBranches(Branches& branches, Node *node, Node *dad) {
+    // Walks the tree from node (root when NULL) and records every branch that isInnerBranch() accepts.
+    if (!node) node = root;
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        Node *child = (*it)->node;
+        if (isInnerBranch(child, node)) {
+            int key = pairInteger(node->id, child->id);
+            branches.insert(pair<int, Branch>(key, Branch(node, child)));
+        }
+        getInnerBranches(branches, child, node);
+    }
+}
+
 void MTree::getBranchLengths(DoubleVector &len, Node *node, Node *dad) {
     if (!node) {
         node = root;
@@ -971,6 +1114,18 @@ void MTree::getTaxaName(vector<string> &taxname, Node *node, Node *dad) {
 }
 
 
+void MTree::getUnorderedTaxaName(vector<string> &taxname, Node *node, Node *dad) {
+    // Appends the name of every leaf reached from node (root when node is
+    // NULL) to taxname, in the order the traversal meets them; entries are
+    // not placed by taxon ID.
+    if (!node) node = root;
+    if (node->isLeaf())
+        taxname.push_back(node->name);
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        getUnorderedTaxaName(taxname, (*it)->node, node);
+    }
+}
+
 void MTree::getTaxaID(vector<int> &taxa, Node *node, Node *dad) {
     if (!node) node = root;
     if (node->isLeaf()) {
@@ -997,11 +1152,16 @@ bool MTree::containsSplits(SplitGraph& splits) {
 }
 
 Split* MTree::getSplit(Node* node1, Node* node2) {
-	Split* sp = new Split(leafNum);
-	getTaxa(*sp, node1, node2);
-	if (sp->shouldInvert())
-		sp->invert();
-	return sp;
+    Neighbor* node12 = node1->findNeighbor(node2);
+    return node12->split;
+}
+
+Split* MTree::_getSplit(Node* node1, Node* node2) {
+    Split* sp = new Split(leafNum);
+    getTaxa(*sp, node1, node2);
+    if (sp->shouldInvert())
+        sp->invert();
+    return sp;
 }
 
 void MTree::convertSplits(SplitGraph &sg, Split *resp, NodeVector *nodes, Node *node, Node *dad) {
@@ -1019,8 +1179,8 @@ void MTree::convertSplits(SplitGraph &sg, Split *resp, NodeVector *nodes, Node *
             sp->invert();
 		 /* ignore nodes with degree of 2 because such split will be added before */
         if (node->degree() != 2) {
-		  sg.push_back(sp);
-          if (nodes) nodes->push_back((*it)->node);
+        	sg.push_back(sp);
+        	if (nodes) nodes->push_back((*it)->node);
         }
         has_child = true;
     }
@@ -2049,6 +2209,81 @@ void MTree::removeTaxa(StrVector &taxa_names) {
 	initializeTree();
 }
 
+void MTree::getSplits(SplitGraph &splits, Node* node, Node* dad) {
+    // Post-order: the splits found below a neighbor are appended before the
+    // split stored on that neighbor itself. Each entry is a fresh copy of the
+    // stored split, inverted when shouldInvert() says so.
+    if (!node) node = root;
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        getSplits(splits, (*it)->node, node);
+        Split* copy = new Split(*(*it)->split);
+        if (copy->shouldInvert()) copy->invert();
+        splits.push_back(copy);
+    }
+}
+
+void MTree::buildNodeSplit(Split *resp, Node *node, Node *dad) { // accumulates into resp and (re)creates the split stored on every neighbor visited
+    if (!node) {
+        node = root;
+        // top-level call: accumulate into the split stored on the link from root's first neighbor back to root
+        Neighbor* rootNei = root->neighbors[0]->node->findNeighbor(root);
+        if (rootNei->split == NULL) {
+            rootNei->split = new Split(leafNum);
+        } else {
+            delete rootNei->split;
+            rootNei->split = new Split(leafNum);
+        }
+        resp = rootNei->split;
+    }
+    bool has_child = false;
+    FOR_NEIGHBOR_IT(node, dad, it) {
+            if ((*it)->split == NULL) {
+                (*it)->split = new Split(leafNum);
+            } else {
+                delete (*it)->split;
+                (*it)->split = new Split(leafNum);
+            }
+            buildNodeSplit((*it)->split, (*it)->node, node);
+            // fold the split just built for this neighbor into the accumulator of the current node
+            *resp += *((*it)->split);
+            has_child = true;
+        }
+    // on the link back to dad, store an inverted copy of what has been accumulated so far
+    if (dad != NULL) {
+        Neighbor* dadNei = node->findNeighbor(dad);
+        dadNei->split = new Split(*resp); // NOTE(review): a split already stored here is overwritten without delete — confirm intended
+        dadNei->split->invert();
+    }
+    // no neighbor other than dad: add this node's own ID. NOTE(review): this runs after the copy made for dad above, so that copy does not include it — confirm intended
+    if (!has_child) {
+        resp->addTaxon(node->id);
+    }
+}
+
+void MTree::initializeSplitMap(Split *resp, Node *node, Node *dad) {
+    // Stores one split per visited branch in splitBranchMap and accumulates into resp.
+    // A NULL resp gets a scratch accumulator, now freed on return (was leaked).
+    if (!node) node = root;
+    Split *scratch = resp ? NULL : new Split(leafNum);
+    if (scratch) resp = scratch;
+    bool has_child = false;
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        Split *sp = new Split(leafNum);
+        initializeSplitMap(sp, (*it)->node, node);
+        *resp += *sp;
+        if (sp->shouldInvert()) sp->invert();
+        /* ignore nodes with degree of 2 because such split will be added before */
+        if (node->degree() != 2) {
+            Branch curBranch((*it)->node, node);
+            splitBranchMap.insert(make_pair(sp, curBranch));
+        } else
+            delete sp; // not stored anywhere in this case, so free it here instead of leaking it
+        has_child = true;
+    }
+    if (!has_child) resp->addTaxon(node->id);
+    delete scratch; // no-op when the caller supplied resp
+}
+
 Node *MTree::findFarthestLeaf(Node *node, Node *dad) {
     if (!node) 
         node = root;
@@ -2069,30 +2304,6 @@ Node *MTree::findFarthestLeaf(Node *node, Node *dad) {
     return res;
 }
 
-//void MTree::sortNeighborBySubtreeSize(Node *node, Node *dad) {
-//    if (dad && node->isLeaf()) {
-//        node->height = 0.0;
-//        return;
-//    }
-//    
-//    node->height = 0.0;
-//    FOR_NEIGHBOR_DECLARE(node, dad, it) {
-//        sortNeighborBySubtreeSize((*it)->node, node);
-//        if (node->height < (*it)->node->height+1)
-//            node->height = (*it)->node->height+1;
-//    }
-//    
-//    // sort neighbors in ascending order of tree height
-//    FOR_NEIGHBOR(node, dad, it)
-//        for (NeighborVec::iterator it2 = it+1; it2 != node->neighbors.end(); it2++)
-//            if ((*it)->node != dad && (*it)->node->height > (*it2)->node->height) {
-//                Neighbor *nei;
-//                nei = *it;
-//                *it = *it2;
-//                *it2 = nei;
-//            }
-//}
-
 void MTree::getPreOrderBranches(NodeVector &nodes, NodeVector &nodes2, Node *node, Node *dad) {
     if (dad) {
         nodes.push_back(node);
diff --git a/mtree.h b/mtree.h
index 44e7c42..701fc6e 100644
--- a/mtree.h
+++ b/mtree.h
@@ -27,6 +27,7 @@
 #include <sstream>
 #include "hashsplitset.h"
 #include "splitset.h"
+//#include "candidateset.h"
 
 const char ROOT_NAME[] = "_root";
 
@@ -57,6 +58,23 @@ public:
     MTree(MTree &tree);
 
     /**
+     *      Constructor, read tree from string.
+     *      Taxa IDs are assigned according to the order in taxaNames
+     */
+    MTree(string& treeString, vector<string>& taxaNames, bool isRooted);
+
+    /**
+     *  Read tree from string assuming that the taxa names are numeric numbers
+     *  Leaf IDs are then assigned according to the number in the taxa names
+     */
+    MTree(string& treeString, bool isRooted);
+
+    /**
+     *   Assign taxa IDs according to the order in taxaNames
+     */
+    void assignIDs(vector<string>& taxaNames);
+
+    /**
             constructor
      */
     MTree();
@@ -77,6 +95,21 @@ public:
     Node* copyTree(MTree *tree, string &taxa_set, double &len, Node *node = NULL, Node *dad = NULL);
 
     /**
+            In case of a multifurcating tree, extract a bifurcating subtree by randomly removing multifurcations.
+            If the tree is already bifurcating, nothing changes.
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    void extractBifurcatingSubTree(Node *node = NULL, Node *dad = NULL);
+
+
+    /**
+            In case of a multifurcating tree, randomly resolve multifurcating nodes to obtain a strictly bifurcating tree.
+            If the tree is already bifurcating, nothing changes.
+     */
+    void resolveMultifurcation();
+
+    /**
             initialize the tree from a NEWICK tree file
             @param userTreeFile the name of the user tree
             @param is_rooted (IN/OUT) true if tree is rooted
@@ -153,9 +186,6 @@ public:
      */
     void printTree(ostream & out, int brtype = WT_BR_LEN);
 
-
-//    string getTreeString();
-
     /**
             print the tree to the output file in newick format
             @param out the output file.
@@ -254,6 +284,13 @@ public:
     virtual void readTree(istream &in, bool &is_rooted);
 
     /**
+            read the tree from a newick string
+            @param tree_string the tree string.
+            @param is_rooted (IN/OUT) true if tree is rooted
+     */
+    //virtual void readTreeString(string tree_string, bool is_rooted);
+
+    /**
             parse the tree from the input file in newick format
             @param infile the input file
             @param ch (IN/OUT) current char
@@ -358,11 +395,20 @@ public:
             get the descending taxa names below the node
             @param node the starting node, NULL to start from the root
             @param dad dad of the node, used to direct the search
-            @param taxname (OUT) taxa name
+            @param[out] taxname taxa names, with size equal to leafNum and ordered by taxon ID
      */
     void getTaxaName(vector<string> &taxname, Node *node = NULL, Node *dad = NULL);
 
     /**
+            get the descending taxa names below the node. different from getTaxaName() in that the
+            taxa are not ordered by ID at all!
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @param[out] taxname taxa name
+     */
+    void getUnorderedTaxaName(vector<string> &taxname, Node *node, Node *dad);
+
+    /**
             get the descending internal nodes below \a node
             @param node the starting node, NULL to start from the root
             @param dad dad of the node, used to direct the search
@@ -378,7 +424,7 @@ public:
             @param nodes2 (OUT) vector of the other end node of branch
             @param excludeSplits do not collect branches in here
      */
-    void getAllInnerBranches(vector<Node*> &nodes, vector<Node*> &nodes2, SplitGraph* excludeSplits = NULL, Node *node = NULL, Node *dad = NULL);
+    void generateNNIBraches(vector<Node*> &nodes, vector<Node*> &nodes2, SplitGraph* excludeSplits = NULL, Node *node = NULL, Node *dad = NULL);
 
     /**
             get all descending branches below the node
@@ -390,19 +436,21 @@ public:
     void getBranches(NodeVector &nodes, NodeVector &nodes2, Node *node = NULL, Node *dad = NULL);
 
     /**
+            get all inner branches below the node
+            @param branches the branches are stored here
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    void getInnerBranches(Branches& branches, Node *node = NULL, Node *dad = NULL);
+
+    /**
      *      get all descending internal branches below \a node and \a dad up to depth \a depth
      *      @param[in] depth collect all internal branches up to distance \a depth from the current branch
      *      @param[in] node one of the 2 nodes of the current branches
      *      @param[in] dad one of the 2 nodes of the current branches
-     *      @param[out] nodes1 contains one ends of the collected branches
-     *      @param[out] nodes2 contains the other ends of the collected branches
+     *      @param[out] surrBranches the resulting branches
      */
-    void getInnerBranches(NodeVector& nodes1, NodeVector& nodes2, int depth, Node *node, Node *dad);
-
-    /**
-     *  @brief check whether branch (node1, node2) exist in the branch vector (nodes1, node2)
-     */
-    bool branchExist(Node* node1, Node* node2, NodeVector& nodes1, NodeVector& nodes2);
+    void getSurroundingInnerBranches(Node *node, Node *dad, int depth, Branches &surrBranches);
 
     /**
      * @brief: check if the branch is internal
@@ -513,14 +561,44 @@ public:
     void convertSplits(SplitGraph &sg, Split *resp, NodeVector *nodes = NULL, Node *node = NULL, Node *dad = NULL);
 
     /**
+     * Initialize the hash table splitBranchMap, which contains the mapping from split to branch
+     * @param resp (internal) set of taxa below node
+     * @param node the starting node, NULL to start from the root
+     * @param dad dad of the node, used to direct the search
+     */
+    void initializeSplitMap(Split *resp = NULL, Node *node = NULL, Node *dad = NULL);
+
+    /**
+    *   Generate a split for each neighbor node
+    */
+    void buildNodeSplit(Split *resp = NULL, Node *node = NULL, Node *dad = NULL);
+
+    /**
+     *  Get split graph based on split stored in nodes
+     */
+    void getSplits(SplitGraph &splits, Node* node = NULL, Node* dad = NULL);
+
+    /**
+    *   Update the Split-Branch map with the new split defined by a branch
+    *   @param node1 one end of the branch
+    *   @param node2 the other end
+    */
+    //void updateSplitMap(Node* node1, Node* node2);
+
+    /**
      * 		Generate a split defined by branch node1-node2
      * 		@param node1 one end of the branch
      * 		@param node2 one end of the branch
-     * 		@return a pointer to the split (the new split is allocated dynamically)
+     * 		@return the split
      */
     Split* getSplit(Node* node1, Node* node2);
 
     /**
+     *  Slow version of getSplit, which traverses the tree to get the splits
+     */
+    Split* _getSplit(Node* node1, Node* node2);
+
+    /**
      *  Check whehter the tree contains all splits in \a splits
      *  @param splits list of splits to check
      *  @return true or false
@@ -636,7 +714,7 @@ public:
     /**
             number of leaves
      */
-    int leafNum;
+    unsigned int leafNum;
 
     /**
             total number of nodes in the tree
@@ -667,6 +745,11 @@ public:
     double len_scale;
 
     /**
+    *   Pointer to the global params
+    */
+    Params* params;
+
+    /**
             release the nemory.
             @param node the starting node, NULL to start from the root
             @param dad dad of the node, used to direct the search
@@ -675,7 +758,17 @@ public:
 
     void setExtendedFigChar();
 
+    /** set pointer of params variable */
+    virtual void setParams(Params* params) {
+        this->params = params;
+    };
+
 protected:
+    /**
+     * 		Hash table mapping a split to a branch.
+     * 		This data structure is generated when genSplitMap() is called.
+     */
+    unordered_map<Split*, Branch, hashfunc_Split> splitBranchMap;
 
     /**
             line number of the input file, used to output errors in input file
diff --git a/mtreeset.cpp b/mtreeset.cpp
index fd997da..f02f06c 100644
--- a/mtreeset.cpp
+++ b/mtreeset.cpp
@@ -112,6 +112,7 @@ void MTreeSet::init(StrVector &treels, bool &is_rooted) {
 	//for (i = 0; i < trees_id.size(); i++) ok_trees[trees_id[i]] = 1;
 
 	for (StrVector::iterator it = treels.begin(); it != treels.end(); it++)
+    if (!it->empty())
 	{
 		count++;
 		MTree *tree = newTree();
diff --git a/ngs.cpp b/ngs.cpp
index 6b22e1c..7fb98c9 100644
--- a/ngs.cpp
+++ b/ngs.cpp
@@ -410,7 +410,7 @@ NGSTree::NGSTree(Params &params, NGSAlignment *alignment) {
     model_factory = NULL;
     optimize_by_newton = params.optimize_by_newton;
     //tree.sse = params.SSE;
-    setLikelihoodKernel(LK_EIGEN);
+    setLikelihoodKernel(LK_EIGEN, params.num_threads);
 }
 
 double NGSTree::computeLikelihood(double *pattern_lh) {
diff --git a/node.cpp b/node.cpp
index 65cd70f..8a89887 100644
--- a/node.cpp
+++ b/node.cpp
@@ -1,6 +1,8 @@
 /***************************************************************************
- *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
- *   minh.bui at univie.ac.at   *
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
diff --git a/node.h b/node.h
index b4e6500..fbb6604 100644
--- a/node.h
+++ b/node.h
@@ -1,6 +1,8 @@
 /***************************************************************************
- *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
- *   minh.bui at univie.ac.at   *
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -33,8 +35,8 @@
 //#include <time.h>
 #include <math.h>
 #include "ncl/ncl.h"
-
 #include "tools.h"
+#include "split.h"
 
 using namespace std;
 
@@ -46,6 +48,7 @@ class Node;
     Neighbor list of a node in the tree
  */
 class Neighbor {
+
 public:
 
     /**
@@ -64,6 +67,11 @@ public:
     int id;
 
     /**
+    *   The set of taxa underneath the neighbor
+    */
+    Split* split;
+
+    /**
         construct class with a node and length
         @param anode the other end of the branch
         @param alength length of branch
@@ -72,6 +80,7 @@ public:
         node = anode;
         length = alength;
         id = -1;
+        split = NULL;
     }
 
     /**
@@ -84,6 +93,7 @@ public:
         node = anode;
         length = alength;
         id = aid;
+        split = NULL;
     }
 
     /**
@@ -94,6 +104,7 @@ public:
         node = nei->node;
         length = nei->length;
         id = nei->id;
+        split = NULL;
     }
 
     /**
@@ -113,6 +124,9 @@ typedef vector<Neighbor*> NeighborVec;
  */
 typedef vector<Node*> NodeVector;
 
+typedef pair<Node*, Node*> Branch;
+typedef map<int, Branch> Branches;
+
 /*--------------------------------------------------------------*/
 /*--------------------------------------------------------------*/
 
diff --git a/optimization.cpp b/optimization.cpp
index 92a8093..fc64af9 100644
--- a/optimization.cpp
+++ b/optimization.cpp
@@ -38,6 +38,7 @@ void fixBound(double x[], double lower[], double upper[], int n);
 					psum[n]=sum;}
 
 
+/*
 #define IA 16807
 #define IM 2147483647
 #define AM (1.0/IM)
@@ -84,7 +85,7 @@ double ran1(long *idum) {
 #undef NDIV
 #undef EPS
 #undef RNMX
-
+*/
 
 long idum = 123456;
 double tt;
@@ -456,7 +457,7 @@ double Optimization::minimizeNewton(double x1, double xguess, double x2, double
 		}
 		if (f < 0.0)
 			xl=rts;
-		else
+		else if (f > 0.0)
 			xh=rts;
 	}
 	nrerror("Maximum number of iterations exceeded in minimizeNewton");
@@ -477,9 +478,9 @@ double Optimization::minimizeNewton(double x1, double xguess, double x2, double
 
 #define ALF 1.0e-4
 #define TOLX 1.0e-7
-static double maxarg1,maxarg2;
-#define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ?\
-        (maxarg1) : (maxarg2))
+//static double maxarg1,maxarg2;
+//#define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ?\
+//        (maxarg1) : (maxarg2))
 
 void Optimization::lnsrch(int n, double xold[], double fold, double g[], double p[], double x[],
                    double *f, double stpmax, int *check, double lower[], double upper[]) {
@@ -496,7 +497,7 @@ void Optimization::lnsrch(int n, double xold[], double fold, double g[], double
 		slope += g[i]*p[i];
 	test=0.0;
 	for (i=1;i<=n;i++) {
-		temp=fabs(p[i])/FMAX(fabs(xold[i]),1.0);
+		temp=fabs(p[i])/max(fabs(xold[i]),1.0);
 		if (temp > test) test=temp;
 	}
 	alamin=TOLX/test;
@@ -546,7 +547,7 @@ void Optimization::lnsrch(int n, double xold[], double fold, double g[], double
 		alam2=alam;
 		f2 = *f;
 		fold2=fold;
-		alam=FMAX(tmplam,0.1*alam);
+		alam=max(tmplam,0.1*alam);
 		first_time = false;
 	}
 }
@@ -608,8 +609,8 @@ double Optimization::minimizeMultiDimen(double guess[], int ndim, double lower[]
 
 
 #define ITMAX 200
-static double sqrarg;
-#define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)
+//static double sqrarg;
+#define SQR(a) ((a)*(a))
 #define EPS 3.0e-8
 #define TOLX (4*EPS)
 #define STPMX 100.0
@@ -641,7 +642,7 @@ void Optimization::dfpmin(double p[], int n, double lower[], double upper[], dou
 	//checkBound(p, xi, lower, upper, n);
 	//checkDirection(p, xi);
 
-	stpmax=STPMX*FMAX(sqrt(sum),(double)n);
+	stpmax=STPMX*max(sqrt(sum),(double)n);
 	for (its=1;its<=ITMAX;its++) {
 		*iter=its;
 		lnsrch(n,p,fp,g,xi,pnew,fret,stpmax,&check, lower, upper);
@@ -652,7 +653,7 @@ void Optimization::dfpmin(double p[], int n, double lower[], double upper[], dou
 		}
 		test=0.0;
 		for (i=1;i<=n;i++) {
-			temp=fabs(xi[i])/FMAX(fabs(p[i]),1.0);
+			temp=fabs(xi[i])/max(fabs(p[i]),1.0);
 			if (temp > test) test=temp;
 		}
 		if (test < TOLX) {
@@ -662,9 +663,9 @@ void Optimization::dfpmin(double p[], int n, double lower[], double upper[], dou
 		for (i=1;i<=n;i++) dg[i]=g[i];
 		derivativeFunk(p,g);
 		test=0.0;
-		den=FMAX(fabs(*fret),1.0); // fix bug found by Tung, as also suggested by NR author
+		den=max(fabs(*fret),1.0); // fix bug found by Tung, as also suggested by NR author
 		for (i=1;i<=n;i++) {
-			temp=fabs(g[i])*FMAX(fabs(p[i]),1.0)/den;
+			temp=fabs(g[i])*max(fabs(p[i]),1.0)/den;
 			if (temp > test) test=temp;
 		}
 		if (test < gtol) {
@@ -703,7 +704,7 @@ void Optimization::dfpmin(double p[], int n, double lower[], double upper[], dou
 		//checkDirection(p, xi);
 		//if (*iter > 200) cout << "iteration=" << *iter << endl;
 	}
-	// BQM: TODO disable this message!
+	// BQM: disable this message!
 	//nrerror("too many iterations in dfpmin");
 	FREEALL
 }
diff --git a/pda.cpp b/pda.cpp
index 13df75c..639e22f 100644
--- a/pda.cpp
+++ b/pda.cpp
@@ -65,7 +65,12 @@
 #include "timeutil.h"
 //#include <unistd.h>
 #include <stdlib.h>
-#include "vectorclass/vectorclass.h"
+#include "vectorclass/instrset.h"
+
+#include "MPIHelper.h"
+#ifdef _IQTREE_MPI
+#include <mpi.h>
+#endif
 
 #ifdef _OPENMP
 	#include <omp.h>
@@ -200,6 +205,9 @@ inline void separator(ostream &out, int type = 0) {
 void printCopyright(ostream &out) {
 #ifdef IQ_TREE
  	out << "IQ-TREE";
+    #ifdef _IQTREE_MPI
+    out << " MPI";
+    #endif
 	#ifdef _OPENMP
 	out << " multicore";
 	#endif
@@ -227,7 +235,7 @@ void printCopyright(ostream &out) {
 #endif
 
 #ifdef IQ_TREE
-	out << endl << "Copyright (c) 2011-2015 Nguyen Lam Tung, Olga Chernomor, Arndt von Haeseler and Bui Quang Minh." << endl << endl;
+	out << endl << "Copyright (c) 2011-2016 Nguyen Lam Tung, Olga Chernomor, Arndt von Haeseler and Bui Quang Minh." << endl << endl;
 #else
 	out << endl << "Copyright (c) 2006-2014 Olga Chernomor, Arndt von Haeseler and Bui Quang Minh." << endl << endl;
 #endif
@@ -1536,7 +1544,7 @@ void branchStats(Params &params){
 	
 	/***** Following added by BQM to print internal branch lengths */
 	NodeVector nodes1, nodes2;
-	mytree.getAllInnerBranches(nodes1, nodes2);
+	mytree.generateNNIBraches(nodes1, nodes2);
 	output = params.out_prefix;
 	output += ".inlen";
 	try {
@@ -1727,7 +1735,7 @@ protected:
 };
 
 outstreambuf* outstreambuf::open( const char* name, ios::openmode mode) {
-    if (!(Params::getInstance().suppress_output_flags & OUT_LOG)) {
+    if (!(Params::getInstance().suppress_output_flags & OUT_LOG) && MPIHelper::getInstance().isMaster()) {
         fout.open(name, mode);
         if (!fout.is_open()) {
             cout << "Could not open " << name << " for logging" << endl;
@@ -1737,7 +1745,6 @@ outstreambuf* outstreambuf::open( const char* name, ios::openmode mode) {
     }
 	cout_buf = cout.rdbuf();
 	cout.rdbuf(this);
-	cerr.rdbuf(this);
     return this;
 }
 
@@ -1752,18 +1759,22 @@ outstreambuf* outstreambuf::close() {
 }
 
 int outstreambuf::overflow( int c) { // used for output buffer only
-	if (verbose_mode >= VB_MIN)
+	if ((verbose_mode >= VB_MIN && MPIHelper::getInstance().isMaster()) || verbose_mode >= VB_MED)
 		if (cout_buf->sputc(c) == EOF) return EOF;
     if (Params::getInstance().suppress_output_flags & OUT_LOG)
         return c;
+    if (!MPIHelper::getInstance().isMaster())
+        return c;
 	if (fout_buf->sputc(c) == EOF) return EOF;
 	return c;
 }
 
+
+
 int outstreambuf::sync() { // used for output buffer only
-	if (verbose_mode >= VB_MIN)
+	if ((verbose_mode >= VB_MIN && MPIHelper::getInstance().isMaster()) || verbose_mode >= VB_MED)
 		cout_buf->pubsync();
-    if (Params::getInstance().suppress_output_flags & OUT_LOG)
+    if ((Params::getInstance().suppress_output_flags & OUT_LOG) || !MPIHelper::getInstance().isMaster())
         return 0;        
 	return fout_buf->pubsync();
 }
@@ -1773,6 +1784,7 @@ public:
     void init(streambuf *fout_buf) {
         this->fout_buf = fout_buf;
         cerr_buf = cerr.rdbuf();
+        cerr.rdbuf(this);
     }
     
     ~errstreambuf() {
@@ -1785,7 +1797,7 @@ protected:
     
     virtual int overflow( int c = EOF) {
         if (cerr_buf->sputc(c) == EOF) return EOF;
-        if (Params::getInstance().suppress_output_flags & OUT_LOG)
+        if ((Params::getInstance().suppress_output_flags & OUT_LOG))
             return c;
         if (fout_buf->sputc(c) == EOF) return EOF;
         return c;
@@ -1801,13 +1813,14 @@ protected:
 
 
 
-
+/*********************************************************************************
+ * GLOBAL VARIABLES
+ *********************************************************************************/
 outstreambuf _out_buf;
 errstreambuf _err_buf;
 string _log_file;
 int _exit_wait_optn = FALSE;
 
-
 extern "C" void startLogFile(bool append_log) {
     if (append_log)
         _out_buf.open(_log_file.c_str(), ios::app);
@@ -1848,6 +1861,9 @@ extern "C" void funcAbort(int signal_number)
 		case SIGFPE:  cerr << "ERRONEOUS NUMERIC"; break;
 		case SIGILL:  cerr << "ILLEGAL INSTRUCTION"; break;
 		case SIGSEGV: cerr << "SEGMENTATION FAULT"; break;
+#if !defined WIN32 && !defined _WIN32 && !defined __WIN32__
+		case SIGBUS: cerr << "BUS ERROR"; break;
+#endif
 	}
     cerr << endl;
 	cerr << "*** For bug report please send to developers:" << endl << "***    Log file: " << _log_file;
@@ -2179,57 +2195,69 @@ Instruction set ID reported by vectorclass::instrset_detect
 */
 int instruction_set;
 
-int main(int argc, char *argv[])
-{
+int main(int argc, char *argv[]) {
+#ifdef _IQTREE_MPI
+	double time_initial, time_current;
+	int n_tasks, task_id;
+	if (MPI_Init(&argc, &argv) != MPI_SUCCESS) {
+		outError("MPI initialization failed!");
+	}
+	MPI_Comm_size(MPI_COMM_WORLD, &n_tasks);
+	MPI_Comm_rank(MPI_COMM_WORLD, &task_id);
+	MPIHelper::getInstance().setNumProcesses(n_tasks);
+	MPIHelper::getInstance().setProcessID(task_id);
+	MPIHelper::getInstance().setNumTreeReceived(0);
+	MPIHelper::getInstance().setNumTreeSent(0);
+    MPIHelper::getInstance().setNumNNISearch(0);
+#endif
 
 	/*************************/
 	{ /* local scope */
-		int found=FALSE;              /* "click" found in cmd name? */
+		int found = FALSE;              /* "click" found in cmd name? */
 		int n, dummyint;
 		char *tmpstr;
-		int     intargc; 
-		char  **intargv; 
-		intargc = 0; 
-		intargv = NULL; 
-		
-		for (n = strlen(argv[0]) - 5; 
-		    (n >= 0) && !found && (argv[0][n] != '/')
-		             && (argv[0][n] != '\\'); n--) {
+		int intargc;
+		char **intargv;
+		intargc = 0;
+		intargv = NULL;
+
+		for (n = strlen(argv[0]) - 5;
+			 (n >= 0) && !found && (argv[0][n] != '/')
+			 && (argv[0][n] != '\\'); n--) {
 
 			tmpstr = &(argv[0][n]);
 			dummyint = 0;
-			(void)sscanf(tmpstr, "click%n", &dummyint);
+			(void) sscanf(tmpstr, "click%n", &dummyint);
 			if (dummyint == 5) found = TRUE;
 			else {
 				dummyint = 0;
-				(void)sscanf(tmpstr, "CLICK%n", &dummyint);
+				(void) sscanf(tmpstr, "CLICK%n", &dummyint);
 				if (dummyint == 5) found = TRUE;
 				else {
 					dummyint = 0;
-					(void)sscanf(tmpstr, "Click%n", &dummyint);
+					(void) sscanf(tmpstr, "Click%n", &dummyint);
 					if (dummyint == 5) found = TRUE;
 				}
 			}
 		}
-		if(found) _exit_wait_optn = TRUE;
+		if (found) _exit_wait_optn = TRUE;
 
 		if (_exit_wait_optn) { // get commandline parameters from keyboard
-			getintargv(&intargc, &intargv); 
+			getintargv(&intargc, &intargv);
 			fprintf(stdout, "\n\n");
-			if(intargc > 1) { // if there were option entered, use them as argc/argv
-				argc = intargc; 
-				argv = intargv; 
-			} 
+			if (intargc > 1) { // if there were option entered, use them as argc/argv
+				argc = intargc;
+				argv = intargv;
+			}
 		}
 	} /* local scope */
 	/*************************/
 
-	//Params params;
 	parseArg(argc, argv, Params::getInstance());
 
     // 2015-12-05
     Checkpoint *checkpoint = new Checkpoint;
-    string filename = (string)Params::getInstance().out_prefix + ".ckp.gz";
+    string filename = (string)Params::getInstance().out_prefix +".ckp.gz";
     checkpoint->setFileName(filename);
     
     bool append_log = false;
@@ -2255,27 +2283,53 @@ int main(int argc, char *argv[])
         }
     }
 
-    _log_file = Params::getInstance().out_prefix;
-    _log_file += ".log";
-    startLogFile(append_log);
+    // after loading, workers are not allowed to write checkpoint anymore
+    if (MPIHelper::getInstance().isWorker())
+        checkpoint->setFileName("");
+
+	_log_file = Params::getInstance().out_prefix;
+	_log_file += ".log";
+	startLogFile(append_log);
+	time_t start_time;
 
     if (append_log) {
         cout << endl << "******************************************************"
              << endl << "CHECKPOINT: Resuming analysis from " << filename << endl << endl;
     }
+#ifdef _IQTREE_MPI
+	cout << "************************************************" << endl;
+	cout << "* START TREE SEARCH USING MPI WITH " << MPIHelper::getInstance().getNumProcesses() << " PROCESSES *" << endl;
+	cout << "************************************************" << endl;
+	unsigned int rndSeed;
+	if (MPIHelper::getInstance().isMaster()) {
+		rndSeed = Params::getInstance().ran_seed;
+		cout << "Random seed of master = " << rndSeed << endl;
+	}
+	// Broadcast random seed
+	MPI_Bcast(&rndSeed, 1, MPI_INT, PROC_MASTER, MPI_COMM_WORLD);
+	if (MPIHelper::getInstance().isWorker()) {
+//		Params::getInstance().ran_seed = rndSeed + task_id * 100000;
+		Params::getInstance().ran_seed = rndSeed;
+//		printf("Process %d: random_seed = %d\n", task_id, Params::getInstance().ran_seed);
+	}
+#endif
 
 	atexit(funcExit);
 	signal(SIGABRT, &funcAbort);
 	signal(SIGFPE, &funcAbort);
 	signal(SIGILL, &funcAbort);
 	signal(SIGSEGV, &funcAbort);
+#if !defined WIN32 && !defined _WIN32 && !defined __WIN32__
+	signal(SIGBUS, &funcAbort);
+#endif
 	printCopyright(cout);
+
 	/*
-	double x=1e-100;
-	double y=1e-101;
-	if (x > y) cout << "ok!" << endl;
-	else cout << "shit!" << endl;
-	*/
+    double x=1e-100;
+    double y=1e-101;
+    if (x > y) cout << "ok!" << endl;
+    else cout << "shit!" << endl;
+    */
 	//FILE *pfile = popen("hostname","r");
 	char hostname[100];
 #if defined WIN32 || defined _WIN32 || defined __WIN32__
@@ -2295,10 +2349,10 @@ int main(int argc, char *argv[])
 #endif
 	if (instruction_set < 3) outError("Your CPU does not support SSE3!");
 	bool has_fma3 = (instruction_set >= 7) && hasFMA3();
-	bool has_fma4 = (instruction_set >= 7) && hasFMA4();
+//	bool has_fma4 = (instruction_set >= 7) && hasFMA4();
 
 #ifdef __FMA__
-	bool has_fma =  has_fma3 || has_fma4;
+	bool has_fma =  has_fma3;
 	if (!has_fma) {
 		outError("Your CPU does not support FMA instruction, quiting now...");
 	}
@@ -2306,16 +2360,19 @@ int main(int argc, char *argv[])
 
 	cout << "Host:    " << hostname << " (";
 	switch (instruction_set) {
+	case 0: cout << "80386, "; break;
+	case 1: cout << "SSE, "; break;
+	case 2: cout << "SSE2, "; break;
 	case 3: cout << "SSE3, "; break;
 	case 4: cout << "SSSE3, "; break;
 	case 5: cout << "SSE4.1, "; break;
 	case 6: cout << "SSE4.2, "; break;
 	case 7: cout << "AVX, "; break;
 	case 8: cout << "AVX2, "; break;
-	default: cout << "AVX512F, "; break;
+	default: cout << "AVX512, "; break;
 	}
 	if (has_fma3) cout << "FMA3, ";
-	if (has_fma4) cout << "FMA4, ";
+//	if (has_fma4) cout << "FMA4, ";
 //#if defined __APPLE__ || defined __MACH__
 	cout << (int)(((getMemorySize()/1024.0)/1024)/1024) << " GB RAM)" << endl;
 //#else
@@ -2330,16 +2387,20 @@ int main(int argc, char *argv[])
 
     checkpoint->get("iqtree.seed", Params::getInstance().ran_seed);
 	cout << "Seed:    " << Params::getInstance().ran_seed <<  " ";
-	init_random(Params::getInstance().ran_seed, true);
+	init_random(Params::getInstance().ran_seed + MPIHelper::getInstance().getProcessID(), true);
 
-	time_t start_time;
 	time(&start_time);
 	cout << "Time:    " << ctime(&start_time);
 
-	if (Params::getInstance().lk_no_avx)
+	if (Params::getInstance().lk_no_avx == 1)
 		instruction_set = min(instruction_set, 6);
 
 	cout << "Kernel:  ";
+
+    if (Params::getInstance().lk_safe_scaling) {
+        cout << "Safe ";
+    }
+
 	if (Params::getInstance().pll) {
 #ifdef __AVX__
 		cout << "PLL-AVX";
@@ -2347,10 +2408,13 @@ int main(int argc, char *argv[])
 		cout << "PLL-SSE3";
 #endif
 	} else {
+        bool has_fma = (has_fma3) && (instruction_set >= 7) && Params::getInstance().lk_no_avx != 2;
 		switch (Params::getInstance().SSE) {
 		case LK_EIGEN: cout << "No SSE"; break;
 		case LK_EIGEN_SSE:
-			if (instruction_set >= 7) {
+            if (has_fma) {
+                cout << "AVX+FMA";
+            } else if (instruction_set >= 7) {
 				cout << "AVX";
 			} else {
 				cout << "SSE3";
@@ -2363,18 +2427,23 @@ int main(int argc, char *argv[])
 		}
 	}
 
-
-
 #ifdef _OPENMP
-	if (Params::getInstance().num_threads == 0) {
+	if (Params::getInstance().num_threads < 0) {
 		cout << endl << endl;
-		outError("Please specify the number of cores to use (-nt option)!");
+		outError("Please specify number of cores via -nt option. Use '-nt AUTO' to automatically determine the best number of cores");
 	}
-	if (Params::getInstance().num_threads) omp_set_num_threads(Params::getInstance().num_threads);
+	if (Params::getInstance().num_threads >= 1) {
+        omp_set_num_threads(Params::getInstance().num_threads);
+        Params::getInstance().num_threads = omp_get_max_threads();
+    }
 //	int max_threads = omp_get_max_threads();
-	Params::getInstance().num_threads = omp_get_max_threads();
 	int max_procs = countPhysicalCPUCores();
-	cout << " - " << Params::getInstance().num_threads  << " threads (" << max_procs << " CPU cores detected)";
+	cout << " - ";
+    if (Params::getInstance().num_threads > 0)
+        cout << Params::getInstance().num_threads  << " threads";
+    else
+        cout << "auto-detect";
+    cout << "(" << max_procs << " CPU cores detected)";
 	if (Params::getInstance().num_threads  > max_procs) {
 		cout << endl;
 		outError("You have specified more threads than CPU cores available");
@@ -2386,10 +2455,12 @@ int main(int argc, char *argv[])
 		outError("Number of threads must be 1 for sequential version.");
 	}
     int num_procs = countPhysicalCPUCores();
+#ifndef _IQTREE_MPI
     if (num_procs > 1) {
         cout << endl << endl << "NOTE: Consider using the multicore version because your CPU has " << num_procs << " cores!";
     }
 #endif
+#endif
 	//cout << "sizeof(int)=" << sizeof(int) << endl;
 	cout << endl << endl;
 
@@ -2434,6 +2505,13 @@ int main(int argc, char *argv[])
     CKP_SAVE(version);
     checkpoint->endStruct();
 
+    if (MPIHelper::getInstance().getNumProcesses() > 1) {
+        if (Params::getInstance().aln_file || Params::getInstance().partition_file) {
+            runPhyloAnalysis(Params::getInstance(), checkpoint);
+        } else {
+            outError("Please use one MPI process! The feature you wanted does not need parallelization.");
+        }
+    } else
 	// call the main function
 	if (Params::getInstance().tree_gen != NONE) {
 		generateRandomTree(Params::getInstance());
@@ -2529,10 +2607,14 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	delete checkpoint;
 	time(&start_time);
 	cout << "Date and Time: " << ctime(&start_time);
+	delete checkpoint;
 
 	finish_random();
+    
+#ifdef _IQTREE_MPI
+    MPI_Finalize();
+#endif    
 	return EXIT_SUCCESS;
 }
diff --git a/pdtree.cpp b/pdtree.cpp
index bc92ff4..bfd9d3a 100644
--- a/pdtree.cpp
+++ b/pdtree.cpp
@@ -20,7 +20,6 @@
 #include "ncl/ncl.h"
 #include "tools.h"
 #include "pdtree.h"
-#include "msetsblock.h"
 #include "myreader.h"
 
 /*********************************************
@@ -163,7 +162,7 @@ void PDTree::readParams(Params &params) {
 	// now convert the weights
 	LeafMapName lsn;
 	buildLeafMapName(lsn);
-	tax_weight.resize(ntaxa, 0);
+	tax_weight.resize((unsigned long) ntaxa, 0);
 	for (int i = 0; i < tax_name.size(); i++) {
 		LeafMapName::iterator nameit = lsn.find(tax_name[i]);
 		if (nameit == lsn.end())
@@ -224,7 +223,7 @@ void PDTree::computePD(Params &params, vector<PDTaxaSet> &taxa_set, PDRelatedMea
 
 	//sets->Report(cout);
 
-	taxa_set.resize(sets->getNSets());
+	taxa_set.resize((unsigned long) sets->getNSets());
 
 	vector<PDTaxaSet>::iterator it_ts;
 	TaxaSetNameVector::iterator i;
diff --git a/phyloanalysis.cpp b/phyloanalysis.cpp
index 2eaabe6..7017887 100644
--- a/phyloanalysis.cpp
+++ b/phyloanalysis.cpp
@@ -1,6 +1,8 @@
 /***************************************************************************
- *   Copyright (C) 2009 by BUI Quang Minh   *
- *   minh.bui at univie.ac.at   *
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -52,6 +54,7 @@
 #include "model/modelset.h"
 #include "timeutil.h"
 #include "upperbounds.h"
+#include "MPIHelper.h"
 
 
 void reportReferences(Params &params, ofstream &out, string &original_model) {
@@ -576,9 +579,9 @@ void printOutfilesInfo(Params &params, string &original_model, IQTree &tree) {
             else
                 cout << "  Tree used for model selection: " << params.out_prefix << ".treefile" << endl;
         }
-		if (params.snni && params.write_local_optimal_trees) {
-			cout << "  Locally optimal trees (" << tree.candidateTrees.getNumLocalOptTrees() << "):    " << params.out_prefix << ".suboptimal_trees" << endl;
-		}
+//		if (params.snni && params.write_local_optimal_trees) {
+//			cout << "  Locally optimal trees (" << tree.candidateTrees.getNumLocalOptTrees() << "):    " << params.out_prefix << ".suboptimal_trees" << endl;
+//		}
 	}
 	if (!params.user_file && params.start_tree == STT_BIONJ) {
 		cout << "  BIONJ tree:                    " << params.out_prefix << ".bionj"
@@ -618,14 +621,29 @@ void printOutfilesInfo(Params &params, string &original_model, IQTree &tree) {
 		cout << "  Site log-likelihoods:          " << params.out_prefix << ".sitelh"
 				<< endl;
 
+	if (params.print_partition_lh)
+		cout << "  Partition log-likelihoods:     " << params.out_prefix << ".partlh"
+				<< endl;
+
 	if (params.print_site_prob)
 		cout << "  Site probability per rate/mix: " << params.out_prefix << ".siteprob"
 				<< endl;
 
+    if (params.print_ancestral_sequence) {
+        cout << "  Ancestral state probabilities: " << params.out_prefix << ".ancestralprob" << endl;
+        cout << "  Ancestral sequences:           " << params.out_prefix << ".ancestralseq" << endl;
+    }
+
 	if (params.write_intermediate_trees)
 		cout << "  All intermediate trees:        " << params.out_prefix << ".treels"
 				<< endl;
 
+    if (params.writeDistImdTrees) {
+        tree.intermediateTrees.printTrees(string("ditrees"));
+        cout << "  Distinct intermediate trees:   " << params.out_prefix <<  ".ditrees" << endl;
+        cout << "  Logl of intermediate trees:    " << params.out_prefix <<  ".ditrees_lh" << endl;
+    }
+
 	if (params.gbo_replicates) {
 		cout << endl << "Ultrafast bootstrap approximation results written to:" << endl
 			 << "  Split support values:          " << params.out_prefix << ".splits.nex" << endl
@@ -641,9 +659,6 @@ void printOutfilesInfo(Params &params, string &original_model, IQTree &tree) {
 		if (params.print_tree_lh) {
 		cout << "  Tree log-likelihoods:          " << params.out_prefix << ".treelh" << endl;
 		}
-		if (params.print_site_lh) {
-		cout << "  Site log-likelihoods:          " << params.out_prefix << ".sitelh" << endl;
-		}
 	}
     	if (params.lmap_num_quartets >= 0) {
 		cout << "  Likelihood mapping plot (SVG): " << params.out_prefix << ".lmap.svg" << endl;
@@ -660,7 +675,9 @@ void printOutfilesInfo(Params &params, string &original_model, IQTree &tree) {
 
 void reportPhyloAnalysis(Params &params, string &original_model,
 		IQTree &tree, vector<ModelInfo> &model_info) {
-        
+    if (!MPIHelper::getInstance().isMaster()) {
+        return;
+    }
     if (params.suppress_output_flags & OUT_IQTREE) {
         printOutfilesInfo(params, original_model, tree);
         return;
@@ -890,12 +907,12 @@ void reportPhyloAnalysis(Params &params, string &original_model,
 			tree.setRootNode(params.root);
             
             if (params.gbo_replicates) {
-                if (tree.boot_consense_logl > tree.candidateTrees.getBestScore() + 0.1) {
+                if (tree.boot_consense_logl > tree.getBestScore() + 0.1) {
                     out << endl << "**NOTE**: Consensus tree has higher likelihood than ML tree found! Please use consensus tree below." << endl;
                 }
             }
 
-			reportTree(out, params, tree, tree.candidateTrees.getBestScore(), tree.logl_variance, true);
+			reportTree(out, params, tree, tree.getBestScore(), tree.logl_variance, true);
 
 			if (tree.isSuperTree() && verbose_mode >= VB_MED) {
 				PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
@@ -944,9 +961,7 @@ void reportPhyloAnalysis(Params &params, string &original_model,
 			string con_file = params.out_prefix;
 			con_file += ".contree";
 
-            IntVector rfdist;
-            tree.computeRFDist(con_file.c_str(), rfdist);
-            out << endl << "Robinson-Foulds distance between ML tree and consensus tree: " << rfdist[0] << endl;
+            out << endl << "Robinson-Foulds distance between ML tree and consensus tree: " << params.contree_rfdist << endl;
             
             out << endl << "Branches with bootstrap support >"
 					<< floor(params.split_threshold * 1000) / 10 << "% are kept";
@@ -1301,6 +1316,8 @@ void initializeParams(Params &params, IQTree &iqtree, vector<ModelInfo> &model_i
     bool test_only = params.model_name.find("ONLY") != string::npos;
     /* initialize substitution model */
     if (params.model_name.substr(0, 4) == "TEST") {
+        if (MPIHelper::getInstance().getNumProcesses() > 1)
+            outError("Please use only 1 MPI process! We are currently working on the MPI parallelization of model selection.");
     	// TODO: check if necessary
 //        if (iqtree.isSuperTree())
 //            ((PhyloSuperTree*) &iqtree)->mapTrees();
@@ -1341,7 +1358,7 @@ void initializeParams(Params &params, IQTree &iqtree, vector<ModelInfo> &model_i
         fmodel.precision(4);
         fmodel << fixed;
 
-        params.model_name = testModel(params, &iqtree, model_info, fmodel, models_block, "", true);
+        params.model_name = testModel(params, &iqtree, model_info, fmodel, models_block, params.num_threads, "", true);
         fmodel.close();
         params.startCPUTime = start_cpu_time;
         params.start_real_time = start_real_time;
@@ -1402,8 +1419,8 @@ void pruneTaxa(Params &params, IQTree &iqtree, double *pattern_lh, NodeVector &p
 		iqtree.clearAllPartialLH();
 		iqtree.setCurScore(iqtree.optimizeAllBranches());
 		//cout << "Log-likelihood	after reoptimizing model parameters: " << tree.curScore << endl;
-		int nni_count, nni_steps;
-		iqtree.setCurScore(iqtree.optimizeNNI(nni_count, nni_steps));
+//		pair<int, int> nniInfo = iqtree.optimizeNNI();
+        iqtree.optimizeNNI();
 		cout << "Log-likelihood after optimizing partial tree: "
 				<< iqtree.getCurScore() << endl;
 	}
@@ -1420,8 +1437,8 @@ void restoreTaxa(IQTree &iqtree, double *saved_dist_mat, NodeVector &pruned_taxa
 		iqtree.clearAllPartialLH();
 		iqtree.setCurScore(iqtree.optimizeAllBranches());
 		//cout << "Log-likelihood	after reoptimizing model parameters: " << tree.curScore << endl;
-		int nni_count, nni_steps;
-		iqtree.setCurScore(iqtree.optimizeNNI(nni_count, nni_steps));
+		pair<int, int> nniInfo;
+		nniInfo = iqtree.optimizeNNI();
 		cout << "Log-likelihood	after reoptimizing full tree: " << iqtree.getCurScore() << endl;
 		//iqtree.setBestScore(iqtree.getModelFactory()->optimizeParameters(params.fixed_branch_length, true, params.model_eps));
 
@@ -1512,10 +1529,23 @@ void printMiscInfo(Params &params, IQTree &iqtree, double *pattern_lh) {
 			printSiteLhCategory(site_lh_file.c_str(), &iqtree, params.print_site_lh);
 	}
 
+    if (params.print_partition_lh && !iqtree.isSuperTree()) {
+        outWarning("-wpl does not work with non-partition model");
+        params.print_partition_lh = false;
+    }
+	if (params.print_partition_lh && !params.pll) {
+        string part_lh_file = (string)params.out_prefix + ".partlh";
+        printPartitionLh(part_lh_file.c_str(), &iqtree, pattern_lh);
+	}
+
 	if (params.print_site_prob && !params.pll) {
         printSiteProbCategory(((string)params.out_prefix + ".siteprob").c_str(), &iqtree, params.print_site_prob);
 	}
     
+    if (params.print_ancestral_sequence) {
+        printAncestralSequences(params.out_prefix, &iqtree, params.print_ancestral_sequence);
+    }
+    
     if (params.print_site_state_freq != WSF_NONE) {
 		string site_freq_file = params.out_prefix;
 		site_freq_file += ".sitesf";
@@ -1587,7 +1617,7 @@ void printMiscInfo(Params &params, IQTree &iqtree, double *pattern_lh) {
 		cout << endl << "Computing site-specific rates by "
 				<< rate_mvh->full_name << "..." << endl;
 		rate_mvh->runIterativeProc(params, iqtree);
-		cout << endl << "BEST SCORE FOUND : " << iqtree.candidateTrees.getBestScore()<< endl;
+		cout << endl << "BEST SCORE FOUND : " << iqtree.getBestScore()<< endl;
 		string mhrate_file = params.out_prefix;
 		mhrate_file += ".mhrate";
 		iqtree.getRate()->writeSiteRates(mhrate_file.c_str());
@@ -1661,8 +1691,7 @@ void printFinalSearchInfo(Params &params, IQTree &iqtree, double search_cpu_time
 
 }
 
-void printSuboptimalTrees(IQTree& iqtree, Params& params, string suffix) {
-	vector<string> trees = iqtree.candidateTrees.getTopTrees();
+void printTrees(vector<string> trees, Params &params, string suffix) {
 	ofstream treesOut((string(params.out_prefix) + suffix).c_str(),
 			ofstream::out);
 	for (vector<string>::iterator it = trees.begin(); it != trees.end(); it++) {
@@ -1717,12 +1746,7 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
 
     /********************** Create an initial tree **********************/
     iqtree.computeInitialTree(dist_file, params.SSE);
-    
-    //*** FOR TUNG: This is wrong! a NULL root was already treated correctly
-//    if (params.root == NULL) {
-//    	params.root = iqtree.aln->getSeqName(0).c_str();
-//    	iqtree.setRootNode(params.root);
-//    }
+
    	iqtree.setRootNode(params.root);
 
     /*************** SET UP PARAMETERS and model testing ****************/
@@ -1744,12 +1768,14 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
     delete models_block;
 
     // UpperBounds analysis. Here, to analyse the initial tree without any tree search or optimization
+    /*
     if (params.upper_bound) {
     	iqtree.setCurScore(iqtree.computeLikelihood());
     	cout<<iqtree.getCurScore()<<endl;
     	UpperBounds(&params, iqtree.aln, &iqtree);
     	exit(0);
 	}
+    */
 
     // degree of freedom
     cout << endl;
@@ -1760,40 +1786,66 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
     }
 
     if (!params.pll) {
-        uint64_t mem_size = iqtree.getMemoryRequired();
         uint64_t total_mem = getMemorySize();
-        if (mem_size >= total_mem) {
-            if (params.lh_mem_save == LM_DETECT) {
-                // switch to memory saving technique that reduces memory requirement to 1/3
-                params.lh_mem_save = LM_PER_NODE;
-                mem_size = iqtree.getMemoryRequired();
+        if (params.lh_mem_save == LM_MEM_SAVE && params.max_mem_size > total_mem)
+            params.max_mem_size = total_mem;
+
+        uint64_t mem_required = iqtree.getMemoryRequired();
+
+        if (mem_required >= total_mem*0.95 && !iqtree.isSuperTree()) {
+            // switch to memory saving mode
+            if (params.lh_mem_save != LM_MEM_SAVE) {
+                params.max_mem_size = (total_mem*0.95)/mem_required;
+                params.lh_mem_save = LM_MEM_SAVE;
+                mem_required = iqtree.getMemoryRequired();
+                cout << "NOTE: Switching to memory saving mode using " << (mem_required / 1073741824.0) << " GB ("
+                    <<  (mem_required*100/total_mem) << "% of normal mode)" << endl;
+                cout << "NOTE: Use -mem option if you want to restrict RAM usage further" << endl;
             }
+            if (mem_required >= total_mem) {
+                params.lh_mem_save = LM_MEM_SAVE;
+                params.max_mem_size = 0.0;
+                mem_required = iqtree.getMemoryRequired();
+            }
+        }
+        if (mem_required >= total_mem) {
+            cerr << "ERROR: Your RAM is below minimum requirement of " << (mem_required / 1073741824.0) << " GB RAM" << endl;
+            outError("Memory saving mode cannot work, switch to another computer!!!");
         }
+
 //#if defined __APPLE__ || defined __MACH__
-        cout << "NOTE: " << (mem_size / 1024) / 1024 << " MB RAM is required!" << endl;
+        cout << "NOTE: " << (mem_required / 1048576) << " MB RAM (" << (mem_required / 1073741824) << " GB) is required!" << endl;
 //#else
 //        cout << "NOTE: " << ((double) mem_size / 1000.0) / 1000 << " MB RAM is required!" << endl;
 //#endif
-        if (mem_size >= total_mem) {
-            outError("Memory required exceeds your computer RAM size!");
-        }
+		if (params.memCheck)
+			exit(0);
 #ifdef BINARY32
-        if (mem_size >= 2000000000) {
+        if (mem_required >= 2000000000) {
             outError("Memory required exceeds 2GB limit of 32-bit executable");
         }
 #endif
         int max_procs = countPhysicalCPUCores();
-        if (mem_size * max_procs > total_mem * params.num_threads) {
-            outWarning("Memory required per CPU-core (" + convertDoubleToString((double)mem_size/params.num_threads/1024/1024/1024)+
+        if (mem_required * max_procs > total_mem * iqtree.num_threads && iqtree.num_threads > 0) {
+            outWarning("Memory required per CPU-core (" + convertDoubleToString((double)mem_required/iqtree.num_threads/1024/1024/1024)+
             " GB) is higher than your computer RAM per CPU-core ("+convertIntToString(total_mem/max_procs/1024/1024/1024)+
             " GB), thus multiple runs may exceed RAM!");
         }
     }
 
+
+#ifdef _OPENMP
+    if (iqtree.num_threads <= 0) {
+        int bestThreads = iqtree.testNumThreads();
+        omp_set_num_threads(bestThreads);
+        params.num_threads = bestThreads;
+    }
+#endif
+
+
     iqtree.initializeAllPartialLh();
-	double initEpsilon = params.min_iterations == 0 ? params.modeps : (params.modeps*10);
+	double initEpsilon = params.min_iterations == 0 ? params.modelEps : (params.modelEps*10);
 
-	string initTree;
 
 	if (iqtree.getRate()->name.find("+I+G") != string::npos) {
 		if (params.alpha_invar_file != NULL) { // COMPUTE TREE LIKELIHOOD BASED ON THE INPUT ALPHA AND P_INVAR VALUE
@@ -1809,12 +1861,14 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
 	}
 
     // Optimize model parameters and branch lengths using ML for the initial tree
+	string initTree;
 	iqtree.clearAllPartialLH();
+
     iqtree.getModelFactory()->restoreCheckpoint();
     if (iqtree.getCheckpoint()->getBool("finishedModelInit")) {
         // model optimization already done: ignore this step
         if (!iqtree.candidateTrees.empty())
-            iqtree.readTreeString(iqtree.candidateTrees.getTopTrees(1)[0]);
+            iqtree.readTreeString(iqtree.getBestTrees()[0]);
         iqtree.setCurScore(iqtree.computeLikelihood());
         initTree = iqtree.getTreeString();
         cout << "CHECKPOINT: Model parameters restored, LogL: " << iqtree.getCurScore() << endl;
@@ -1838,7 +1892,9 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
         cout << "Likelihood mapping needed " << getRealTime()-lkmap_time << " seconds" << endl << endl;
     }
     
-    bool finishedCandidateSet = iqtree.getCheckpoint()->getBool("finishedCandidateSet");
+    // TODO: why is this variable not used? 
+    // ANSWER: moved to doTreeSearch
+//    bool finishedCandidateSet = iqtree.getCheckpoint()->getBool("finishedCandidateSet");
     bool finishedInitTree = iqtree.getCheckpoint()->getBool("finishedInitTree");
 
     // now overwrite with random tree
@@ -1854,92 +1910,64 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
     /****************** NOW PERFORM MAXIMUM LIKELIHOOD TREE RECONSTRUCTION ******************/
 
     // Update best tree
-    if (!finishedInitTree)
-        iqtree.candidateTrees.update(initTree, iqtree.getCurScore());
-
-    if (params.min_iterations > 0) {
-        if (!iqtree.isBifurcating())
-            outError("Tree search does not work with initial multifurcating tree. Please specify `-n 0` to avoid this.");
-        cout << "--------------------------------------------------------------------" << endl;
-        cout << "|             INITIALIZING CANDIDATE TREE SET                      |" << endl;
-        cout << "--------------------------------------------------------------------" << endl;
+    if (!finishedInitTree) {
+        iqtree.addTreeToCandidateSet(initTree, iqtree.getCurScore(), false, MPIHelper::getInstance().getProcessID());
+        iqtree.printResultTree();
+        iqtree.intermediateTrees.update(iqtree.getTreeString(), iqtree.getCurScore());
     }
 
+    if (params.min_iterations && !iqtree.isBifurcating())
+        outError("Tree search does not work with initial multifurcating tree. Please specify `-n 0` to avoid this.");
+
     // Compute maximum likelihood distance
     // ML distance is only needed for IQP
 //    if ( params.start_tree != STT_BIONJ && ((params.snni && !params.iqp) || params.min_iterations == 0)) {
 //        params.compute_ml_dist = false;
 //    }
-    if (params.min_iterations <= 1 && params.start_tree != STT_BIONJ)
+    if ((params.min_iterations <= 1 || params.numInitTrees <= 1) && params.start_tree != STT_BIONJ)
         params.compute_ml_dist = false;
     
     if ((params.user_file || params.start_tree == STT_RANDOM_TREE) && params.snni && !params.iqp) {
         params.compute_ml_dist = false;
     }
-//    if ( params.user_file && params.min_iterations == 0) {
-//        params.compute_ml_dist = false;
-//    }
 
-    if (!finishedInitTree && ((!params.dist_file && params.compute_ml_dist) || params.leastSquareBranch)) {
-        computeMLDist(params, iqtree, dist_file, getCPUTime());
-        if (!params.user_file && params.start_tree != STT_RANDOM_TREE) {
-            // NEW 2015-08-10: always compute BIONJ tree into the candidate set
-            iqtree.resetCurScore();
-            double start_bionj = getRealTime();
-            iqtree.computeBioNJ(params, iqtree.aln, dist_file);
-            cout << getRealTime() - start_bionj << " seconds" << endl;
-            if (iqtree.isSuperTree())
-                iqtree.wrapperFixNegativeBranch(true);
-            else
-                iqtree.wrapperFixNegativeBranch(false);
-            if (params.start_tree == STT_BIONJ) {
-                initTree = iqtree.optimizeModelParameters(params.min_iterations==0, initEpsilon);
-            } else {
-                initTree = iqtree.optimizeBranches();
+    if (params.constraint_tree_file)
+        params.compute_ml_dist = false;
+
+	//Generate BIONJ tree
+	if (MPIHelper::getInstance().isMaster() && !iqtree.getCheckpoint()->getBool("finishedCandidateSet")) {
+        if (!finishedInitTree && ((!params.dist_file && params.compute_ml_dist) || params.leastSquareBranch)) {
+            computeMLDist(params, iqtree, dist_file, getCPUTime());
+            if (!params.user_file && params.start_tree != STT_RANDOM_TREE) {
+                // NEW 2015-08-10: always compute BIONJ tree into the candidate set
+                iqtree.resetCurScore();
+                double start_bionj = getRealTime();
+                iqtree.computeBioNJ(params, iqtree.aln, dist_file);
+                cout << getRealTime() - start_bionj << " seconds" << endl;
+                if (iqtree.isSuperTree())
+                    iqtree.wrapperFixNegativeBranch(true);
+                else
+                    iqtree.wrapperFixNegativeBranch(false);
+                if (params.start_tree == STT_BIONJ) {
+                    initTree = iqtree.optimizeModelParameters(params.min_iterations==0, initEpsilon);
+                } else {
+                    initTree = iqtree.optimizeBranches();
+                }
+                cout << "Log-likelihood of BIONJ tree: " << iqtree.getCurScore() << endl;
+                iqtree.candidateTrees.update(initTree, iqtree.getCurScore());
             }
-            cout << "Log-likelihood of BIONJ tree: " << iqtree.getCurScore() << endl;
-            iqtree.candidateTrees.update(initTree, iqtree.getCurScore());
         }
     }
-
+    
 //    iqtree.saveCheckpoint();
 
 	double cputime_search_start = getCPUTime();
     double realtime_search_start = getRealTime();
 
-    if (params.min_iterations > 0 && !finishedCandidateSet) {
-        double initTime = getCPUTime();
-
-//        if (!params.user_file && (params.start_tree == STT_PARSIMONY || params.start_tree == STT_PLL_PARSIMONY)) 
-//        {
-        	iqtree.initCandidateTreeSet(params.numInitTrees - iqtree.candidateTrees.size(), params.numNNITrees);
-        	assert(iqtree.candidateTrees.size() != 0);
-        	cout << "Finish initializing candidate tree set. ";
-        	cout << "Number of distinct locally optimal trees: " << iqtree.candidateTrees.size() << endl;
-        	if (params.write_local_optimal_trees) {
-        		printSuboptimalTrees(iqtree, params, ".init_suboptimal_trees");
-        	}
-//        }
-        cout << "Current best tree score: " << iqtree.candidateTrees.getBestScore() << " / CPU time: "
-                << getCPUTime() - initTime << endl;
-	}
-
-    if (finishedCandidateSet) {
-        cout << "CHECKPOINT: Candidate tree set restored, best LogL: " << iqtree.candidateTrees.getBestScore() << endl;
-    } else {
-        iqtree.saveCheckpoint();
-        iqtree.getCheckpoint()->putBool("finishedCandidateSet", true);
-        iqtree.getCheckpoint()->dump(true);
-    }
-
     if (params.leastSquareNNI) {
     	iqtree.computeSubtreeDists();
     }
-    /* TUNG: what happens if params.root is not set? This is usually the case.
-     * I added code to ininialize the root above.
-     */
-    //iqtree.setRootNode(params.root); // Important for NNI below
-
+	
 	if (original_model == "WHTEST") {
 		cout << endl << "Testing model homogeneity by Weiss & von Haeseler (2003)..." << endl;
 		WHTest(params, iqtree);
@@ -1955,8 +1983,8 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
 	// prune stable taxa
 	pruneTaxa(params, iqtree, pattern_lh, pruned_taxa, linked_name);
 
-	if (params.min_iterations > 1) {
-		iqtree.readTreeString(iqtree.candidateTrees.getTopTrees()[0]);
+	/***************************************** DO STOCHASTIC TREE SEARCH *******************************************/
+	if (params.min_iterations > 0 && !params.tree_spr) {
 		iqtree.doTreeSearch();
 		iqtree.setAlignment(iqtree.aln);
         cout << "TREE SEARCH COMPLETED AFTER " << iqtree.stop_rule.getCurIt() << " ITERATIONS" 
@@ -1986,14 +2014,19 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
 //	if (iqtree.isSuperTree())
 //			((PhyloSuperTree*) &iqtree)->mapTrees();
 
+    if (!MPIHelper::getInstance().isMaster()) {
+        delete[] pattern_lh;
+        return;
+    }
+
 	if (params.snni && params.min_iterations && verbose_mode >= VB_MED) {
-		cout << "Log-likelihoods of best " << params.popSize << " trees: " << endl;
-		iqtree.printBestScores(params.popSize);
+		cout << "Log-likelihoods of " << params.popSize << " best candidate trees: " << endl;
+		iqtree.printBestScores();
 		cout << endl;
 	}
 
 	if (params.min_iterations) {
-		iqtree.readTreeString(iqtree.candidateTrees.getBestTrees()[0]);
+		iqtree.readTreeString(iqtree.getBestTrees()[0]);
         iqtree.initializeAllPartialLh();
         iqtree.clearAllPartialLH();
         cout << "--------------------------------------------------------------------" << endl;
@@ -2006,8 +2039,12 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
         } else {
             cout << "Performs final model parameters optimization" << endl;
             string tree;
+            Params::getInstance().fixStableSplits = false;
+            Params::getInstance().tabu = false;
+            // why doing NNI search here?
+//            iqtree.doNNISearch();
             tree = iqtree.optimizeModelParameters(true);
-            iqtree.candidateTrees.update(tree, iqtree.getCurScore(), true);
+            iqtree.addTreeToCandidateSet(tree, iqtree.getCurScore(), false, MPIHelper::getInstance().getProcessID());
             iqtree.getCheckpoint()->putBool("finishedModelFinal", true);
             iqtree.saveCheckpoint();
         }
@@ -2019,8 +2056,8 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
 
 	cout << "BEST SCORE FOUND : " << iqtree.getCurScore() << endl;
 
-	if (params.write_local_optimal_trees) {
-		printSuboptimalTrees(iqtree, params, ".suboptimal_trees");
+	if (params.write_candidate_trees) {
+		printTrees(iqtree.getBestTrees(), params, ".imd_trees");
 	}
 
 	if (params.pll)
@@ -2077,17 +2114,17 @@ void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtre
 	// BUG FIX: readTreeString(bestTreeString) not needed before this line
 	iqtree.printResultTree();
 
-	if(params.upper_bound_NNI){
-		string out_file_UB = params.out_prefix;
-		out_file_UB += ".UB.NNI.main";
-		ofstream out_UB;
-		out_UB.exceptions(ios::failbit | ios::badbit);
-		out_UB.open((char*)out_file_UB.c_str(),std::ofstream::out | std::ofstream::app);
-		out_UB<<iqtree.leafNum<<"\t"<<iqtree.aln->getNSite()<<"\t"<<iqtree.params->upper_bound_frac<<"\t"
-				  <<iqtree.skippedNNIub<<"\t"<< iqtree.totalNNIub<<"\t"<<iqtree.candidateTrees.getBestScore() <<endl;
-					//iqtree.minUB << "\t" << iqtree.meanUB/iqtree.skippedNNIub << "\t" << iqtree.maxUB << endl;
-		out_UB.close();
-		}
+    if (params.upper_bound_NNI) {
+        string out_file_UB = params.out_prefix;
+        out_file_UB += ".UB.NNI.main";
+        ofstream out_UB;
+        out_UB.exceptions(ios::failbit | ios::badbit);
+        out_UB.open((char *) out_file_UB.c_str(), std::ofstream::out | std::ofstream::app);
+        out_UB << iqtree.leafNum << "\t" << iqtree.aln->getNSite() << "\t" << iqtree.params->upper_bound_frac << "\t"
+        << iqtree.skippedNNIub << "\t" << iqtree.totalNNIub << "\t" << iqtree.getBestScore() << endl;
+        //iqtree.minUB << "\t" << iqtree.meanUB/iqtree.skippedNNIub << "\t" << iqtree.maxUB << endl;
+        out_UB.close();
+    }
 
 	if (params.out_file)
 		iqtree.printTree(params.out_file);
@@ -2152,7 +2189,7 @@ void searchGAMMAInvarByRestarting(IQTree &iqtree) {
 	if (Params::getInstance().randomAlpha) {
 		while (initAlphas.size() < 10) {
 			double initAlpha = random_double();
-			initAlphas.push_back(initAlpha + MIN_GAMMA_SHAPE*2);
+			initAlphas.push_back(initAlpha + iqtree.params->min_gamma_shape*2);
 		}
 	} else {
 		initAlphas.assign(values, values+10);
@@ -2326,6 +2363,7 @@ void runStandardBootstrap(Params &params, string &original_model, Alignment *ali
     }
     
 	double start_time = getCPUTime();
+	double start_real_time = getRealTime();
 
     
     
@@ -2351,7 +2389,7 @@ void runStandardBootstrap(Params &params, string &original_model, Alignment *ali
         finish_random();
         randstream = saved_randstream;
 
-		if (params.print_tree_lh) {
+		if (params.print_tree_lh && MPIHelper::getInstance().isMaster()) {
 			double prob;
 			bootstrap_alignment->multinomialProb(*alignment, prob);
 			ofstream boot_lh;
@@ -2371,7 +2409,7 @@ void runStandardBootstrap(Params &params, string &original_model, Alignment *ali
 			}
 		} else
 			boot_tree = new IQTree(bootstrap_alignment);
-		if (params.print_bootaln)
+		if (params.print_bootaln && MPIHelper::getInstance().isMaster())
 			bootstrap_alignment->printPhylip(bootaln_name.c_str(), true);
 
         // set checkpoint
@@ -2392,6 +2430,7 @@ void runStandardBootstrap(Params &params, string &original_model, Alignment *ali
 //			outError(ERR_READ_INPUT, treefile_name);
 //		}
 		// write the tree into .boottrees file
+        if (MPIHelper::getInstance().isMaster())
 		try {
 			ofstream tree_out;
 			tree_out.exceptions(ios::failbit | ios::badbit);
@@ -2429,7 +2468,7 @@ void runStandardBootstrap(Params &params, string &original_model, Alignment *ali
 	}
 
 
-	if (params.consensus_type == CT_CONSENSUS_TREE) {
+	if (params.consensus_type == CT_CONSENSUS_TREE && MPIHelper::getInstance().isMaster()) {
 
 		cout << endl << "===> COMPUTE CONSENSUS TREE FROM "
 				<< params.num_bootstrap_samples << " BOOTSTRAP TREES" << endl << endl;
@@ -2447,6 +2486,7 @@ void runStandardBootstrap(Params &params, string &original_model, Alignment *ali
         
 		runTreeReconstruction(params, original_model, *tree, *model_info);
 
+        if (MPIHelper::getInstance().isMaster()) {
 		cout << endl << "===> ASSIGN BOOTSTRAP SUPPORTS TO THE TREE FROM ORIGINAL ALIGNMENT" << endl << endl;
 		MExtTree ext_tree;
 		assignBootstrapSupport(boottrees_name.c_str(), 0, 1e6,
@@ -2454,7 +2494,8 @@ void runStandardBootstrap(Params &params, string &original_model, Alignment *ali
 				params.out_prefix, ext_tree, NULL, &params);
 		tree->copyTree(&ext_tree);
 		reportPhyloAnalysis(params, original_model, *tree, *model_info);
-	} else if (params.consensus_type == CT_CONSENSUS_TREE) {
+        }
+	} else if (params.consensus_type == CT_CONSENSUS_TREE && MPIHelper::getInstance().isMaster()) {
 		int mi = params.min_iterations;
 		STOP_CONDITION sc = params.stop_condition;
 		params.min_iterations = 0;
@@ -2467,7 +2508,9 @@ void runStandardBootstrap(Params &params, string &original_model, Alignment *ali
 	} else
 		cout << endl;
 
-	cout << "Total CPU time for bootstrap: " << (getCPUTime() - start_time) << " seconds." << endl << endl;
+    if (MPIHelper::getInstance().isMaster()) {
+	cout << "Total CPU time for bootstrap: " << (getCPUTime() - start_time) << " seconds." << endl;
+	cout << "Total wall-clock time for bootstrap: " << (getRealTime() - start_real_time) << " seconds." << endl << endl;
 	cout << "Non-parametric bootstrap results written to:" << endl;
 	if (params.print_bootaln)
 		cout << "  Bootstrap alignments:     " << params.out_prefix << ".bootaln" << endl;
@@ -2475,7 +2518,7 @@ void runStandardBootstrap(Params &params, string &original_model, Alignment *ali
 	if (params.consensus_type == CT_CONSENSUS_TREE)
 		cout << "  Consensus tree:           " << params.out_prefix << ".contree" << endl;
 	cout << endl;
-    
+    }
     delete model_info;
 }
 
@@ -2535,7 +2578,7 @@ void computeSiteFrequencyModel(Params &params, Alignment *alignment) {
     delete models_block;
     tree->setModel(tree->getModelFactory()->model);
     tree->setRate(tree->getModelFactory()->site_rate);
-    tree->setLikelihoodKernel(params.SSE);
+    tree->setLikelihoodKernel(params.SSE, params.num_threads);
 
     if (!tree->getModel()->isMixture())
         outError("No mixture model was specified!");
@@ -2551,8 +2594,15 @@ void computeSiteFrequencyModel(Params &params, Alignment *alignment) {
     }
 #endif
 
+#ifdef _OPENMP
+    if (tree->num_threads <= 0) {
+        int bestThreads = tree->testNumThreads();
+        omp_set_num_threads(bestThreads);
+    }
+#endif
+
     tree->initializeAllPartialLh();
-    tree->getModelFactory()->optimizeParameters(params.fixed_branch_length, true, params.modeps);
+    tree->getModelFactory()->optimizeParameters(params.fixed_branch_length, true, params.modelEps);
 
     size_t nptn = alignment->getNPattern(), nstates = alignment->num_states;
     double *ptn_state_freq = new double[nptn*nstates];
@@ -2580,7 +2630,7 @@ void computeSiteFrequencyModel(Params &params, Alignment *alignment) {
 void runPhyloAnalysis(Params &params, Checkpoint *checkpoint) {
 	Alignment *alignment;
 	IQTree *tree;
-    
+
     checkpoint->putBool("finished", false);
     checkpoint->setDumpInterval(params.checkpoint_dump_interval);
 
@@ -2647,6 +2697,16 @@ void runPhyloAnalysis(Params &params, Checkpoint *checkpoint) {
 		alignment->concatenateAlignment(&aln);
 	}
 
+    if (params.constraint_tree_file) {
+        cout << "Reading constraint tree " << params.constraint_tree_file << "..." << endl;
+        tree->constraintTree.initConstraint(params.constraint_tree_file, alignment->getSeqNames());
+        if (params.start_tree == STT_PLL_PARSIMONY)
+            params.start_tree = STT_PARSIMONY;
+        else if (params.start_tree == STT_BIONJ)
+            outError("Constraint tree does not work with -t BIONJ");
+            
+    }
+
     if (params.compute_seq_identity_along_tree) {
         if (!params.user_file)
             outError("Please supply a user tree file!");
@@ -2679,18 +2739,31 @@ void runPhyloAnalysis(Params &params, Checkpoint *checkpoint) {
 //		runBootLhTest(params, alignment, *tree);
 		outError("Obsolete feature");
 	} else if (params.num_bootstrap_samples == 0) {
-		// the main Maximum likelihood tree reconstruction
+	/********************************************************************************
+                    THE MAIN MAXIMUM LIKELIHOOD TREE RECONSTRUCTION
+	 ********************************************************************************/
 		vector<ModelInfo> *model_info = new vector<ModelInfo>;
 		alignment->checkGappySeq(params.remove_empty_seq);
 
 		// remove identical sequences
         if (params.ignore_identical_seqs) {
             tree->removeIdenticalSeqs(params);
+            if (tree->removed_seqs.size() > 0 && MPIHelper::getInstance().isMaster() && (params.suppress_output_flags & OUT_UNIQUESEQ) == 0) {
+                string filename = (string)params.out_prefix + ".uniqueseq.phy";
+                if (tree->isSuperTree())
+                    ((SuperAlignment*)tree->aln)->printCombinedAlignment(filename.c_str());
+                else
+                    tree->aln->printPhylip(filename.c_str());
+                cout << endl << "For your convenience alignment with unique sequences printed to " << filename << endl;
+            }
         }
         alignment = NULL; // from now on use tree->aln instead
 
 		// call main tree reconstruction
         runTreeReconstruction(params, original_model, *tree, *model_info);
+        
+        if (MPIHelper::getInstance().isMaster()) {
+
 		if (params.gbo_replicates && params.online_bootstrap) {
 			if (params.print_ufboot_trees)
 				tree->writeUFBootTrees(params);
@@ -2704,15 +2777,12 @@ void runPhyloAnalysis(Params &params, Checkpoint *checkpoint) {
 			string current_tree = tree->getTreeString();
 			splitsfile = params.out_prefix;
 			splitsfile += ".contree";
+
+            IntVector rfdist;
+            tree->computeRFDist(splitsfile.c_str(), rfdist);
+            params.contree_rfdist = rfdist[0];
+
 			tree->readTreeFile(splitsfile);
-			// bug fix
-//			if ((tree->sse == LK_EIGEN || tree->sse == LK_EIGEN_SSE) && !tree->isBifurcating()) {
-//				cout << "NOTE: Changing to old kernel as consensus tree is multifurcating" << endl;
-//                if (tree->sse == LK_EIGEN)
-//                    tree->changeLikelihoodKernel(LK_NORMAL);
-//                else
-//                    tree->changeLikelihoodKernel(LK_SSE);
-//			}
 
 			tree->initializeAllPartialLh();
 			tree->fixNegativeBranch(true);
@@ -2725,13 +2795,20 @@ void runPhyloAnalysis(Params &params, Checkpoint *checkpoint) {
 			// revert the best tree
 			tree->readTreeString(current_tree);
 		}
+		if (Params::getInstance().writeDistImdTrees) {
+            cout << endl;
+            cout << "Recomputing the log-likelihood of the intermediate trees ... " << endl;
+            tree->intermediateTrees.recomputeLoglOfAllTrees(*tree);
+        }
+		reportPhyloAnalysis(params, original_model, *tree, *model_info);
+        }
+
 		// reinsert identical sequences
 		if (tree->removed_seqs.size() > 0) {
 			// BUG FIX: dont use reinsertIdenticalSeqs anymore
 			tree->insertTaxa(tree->removed_seqs, tree->twin_seqs);
 			tree->printResultTree();
 		}
-		reportPhyloAnalysis(params, original_model, *tree, *model_info);
         delete model_info;
 	} else {
 		// the classical non-parameter bootstrap (SBS)
diff --git a/phyloanalysis.h b/phyloanalysis.h
index afcfede..2fd64c8 100644
--- a/phyloanalysis.h
+++ b/phyloanalysis.h
@@ -1,6 +1,8 @@
 /***************************************************************************
- *   Copyright (C) 2009 by BUI Quang Minh   *
- *   minh.bui at univie.ac.at   *
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
diff --git a/phylokernel.h b/phylokernel.h
index e0b9f7c..8c9ab66 100644
--- a/phylokernel.h
+++ b/phylokernel.h
@@ -9,9 +9,11 @@
 #define PHYLOKERNEL_H_
 
 #include "phylotree.h"
-#include "vectorclass/vectorclass.h"
-#include "vectorclass/vectormath_exp.h"
+//#include "vectorclass/vectorclass.h"
+//#include "vectorclass/vectormath_exp.h"
+#include "superalignment.h"
 
+#ifdef __SSE2__
 inline Vec2d horizontal_add(Vec2d x[2]) {
 #if  INSTRSET >= 3  // SSE3
     return _mm_hadd_pd(x[0],x[1]);
@@ -20,7 +22,7 @@ inline Vec2d horizontal_add(Vec2d x[2]) {
     Vec2d help1 = _mm_shuffle_pd(x[0], x[1], _MM_SHUFFLE2(1,1));
     return _mm_add_pd(help0, help1);
 #else
-#error "You must compile with SSE3 enabled!"
+#error "You must compile with SSE2 enabled!"
 #endif
 }
 
@@ -29,6 +31,7 @@ inline double horizontal_max(Vec2d const &a) {
     a.store(x);
     return max(x[0],x[1]);
 }
+#endif
 
 #ifdef __AVX__
 
@@ -56,10 +59,10 @@ inline double horizontal_max(Vec4d const &a) {
 
 #endif // __AVX__
 
-template <class Numeric, class VectorClass, const int VCSIZE>
+template <class Numeric, class VectorClass>
 Numeric PhyloTree::dotProductSIMD(Numeric *x, Numeric *y, int size) {
 	VectorClass res = VectorClass().load_a(x) * VectorClass().load_a(y);
-	for (int i = VCSIZE; i < size; i += VCSIZE)
+	for (int i = VectorClass::size(); i < size; i += VectorClass::size())
 		res = mul_add(VectorClass().load_a(&x[i]), VectorClass().load_a(&y[i]), res);
 	return horizontal_add(res);
 }
@@ -70,16 +73,10 @@ Numeric PhyloTree::dotProductSIMD(Numeric *x, Numeric *y, int size) {
  *
  *************************************************************************************************/
 
-
+/*
 template <class VectorClass, const int VCSIZE, const int nstates>
 void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
 
-    if (dad_branch->node->degree() > 3) {
-        // TODO: SIMD version for multifurcating node
-        computePartialLikelihoodEigen(dad_branch, dad);
-        return;
-    }
-
     // don't recompute the likelihood
 	assert(dad);
     if (dad_branch->partial_lh_computed & 1)
@@ -101,33 +98,38 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 	}
 
     size_t ptn, c;
-    size_t orig_ntn = aln->size();
+    size_t orig_nptn = aln->size();
 
     size_t ncat = site_rate->getNRate();
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
     assert(nstates == aln->num_states && nstates >= VCSIZE && VCSIZE == VectorClass().size());
     assert(model->isReversible()); // only works with reversible model!
     const size_t nstatesqr=nstates*nstates;
     size_t i, x, j;
-    size_t block = nstates * ncat;
+    size_t block = nstates * ncat_mix;
+    size_t tip_block = nstates * model->getNMixtures();
+
+    size_t mix_addr_nstates[ncat_mix], mix_addr[ncat_mix];
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+    for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom;
+        mix_addr_nstates[c] = m*nstates;
+        mix_addr[c] = m*nstatesqr;
+    }
 
 	// internal node
-	assert(node->degree() == 3); // it works only for strictly bifurcating tree
+    dad_branch->lh_scale_factor = 0.0;
 	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+    int num_leaves = 0;
 	FOR_NEIGHBOR_IT(node, dad, it) {
+        PhyloNeighbor *nei = (PhyloNeighbor*)*it;
 		if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+        if ((nei->partial_lh_computed & 1) == 0)
+            computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(nei, node);
+        dad_branch->lh_scale_factor += nei->lh_scale_factor;
+        if ((*it)->node->isLeaf()) num_leaves++;
 	}
 
-	if (!left->node->isLeaf() && right->node->isLeaf()) {
-		// swap left and right
-		PhyloNeighbor *tmp = left;
-		left = right;
-		right = tmp;
-	}
-	if ((left->partial_lh_computed & 1) == 0)
-		computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(left, node);
-	if ((right->partial_lh_computed & 1) == 0)
-		computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(right, node);
-
     if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
         // re-orient partial_lh
         bool done = false;
@@ -149,103 +151,209 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 	double *evec = model->getEigenvectors();
 	double *inv_evec = model->getInverseEigenvectors();
 
-	VectorClass vc_inv_evec[nstates*nstates/VCSIZE];
 	assert(inv_evec && evec);
-	for (i = 0; i < nstates; i++) {
-		for (x = 0; x < nstates/VCSIZE; x++)
-			// inv_evec is not aligned!
-			vc_inv_evec[i*nstates/VCSIZE+x].load_a(&inv_evec[i*nstates+x*VCSIZE]);
-	}
+//	for (i = 0; i < tip_block; i++) {
+//		for (x = 0; x < nstates/VCSIZE; x++)
+//			// inv_evec is not aligned!
+//			vc_inv_evec[i*nstates/VCSIZE+x].load_a(&inv_evec[i*nstates+x*VCSIZE]);
+//	}
 	double *eval = model->getEigenvalues();
 
-	dad_branch->lh_scale_factor = left->lh_scale_factor + right->lh_scale_factor;
 
-	VectorClass *eleft = (VectorClass*)aligned_alloc<double>(block*nstates);
-	VectorClass *eright = (VectorClass*)aligned_alloc<double>(block*nstates);
+    VectorClass *echildren = aligned_alloc<VectorClass>(block*nstates/VCSIZE*(node->degree()-1));
+    double *partial_lh_leaves = NULL;
+    if (num_leaves > 0)
+        partial_lh_leaves = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block*num_leaves);
+    VectorClass *echild = echildren;
+    double *partial_lh_leaf = partial_lh_leaves;
+    
+    
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        VectorClass expchild[nstates/VCSIZE];
+        PhyloNeighbor *child = (PhyloNeighbor*)*it;
+        VectorClass *echild_ptr = echild;
+        // precompute information buffer
+        for (c = 0; c < ncat_mix; c++) {
+            VectorClass len_child = site_rate->getRate(c%ncat) * child->length;
+            double *eval_ptr = eval + mix_addr_nstates[c];
+            double *evec_ptr = evec + mix_addr[c];
+            for (i = 0; i < nstates/VCSIZE; i++) {
+                // eval is not aligned!
+                expchild[i] = exp(VectorClass().load_a(&eval_ptr[i*VCSIZE]) * len_child);
+            }
+            for (x = 0; x < nstates; x++) {
+                for (i = 0; i < nstates/VCSIZE; i++) {
+                    // evec is not aligned!
+                    echild_ptr[i] = (VectorClass().load_a(&evec_ptr[x*nstates+i*VCSIZE]) * expchild[i]);
+                }
+                echild_ptr += nstates/VCSIZE;
+            }
+        }
 
-	// precompute information buffer
-	for (c = 0; c < ncat; c++) {
-		VectorClass vc_evec;
-		VectorClass expleft[nstates/VCSIZE];
-		VectorClass expright[nstates/VCSIZE];
-		double len_left = site_rate->getRate(c) * left->length;
-		double len_right = site_rate->getRate(c) * right->length;
-		for (i = 0; i < nstates/VCSIZE; i++) {
-			// eval is not aligned!
-			expleft[i] = exp(VectorClass().load_a(&eval[i*VCSIZE]) * VectorClass(len_left));
-			expright[i] = exp(VectorClass().load_a(&eval[i*VCSIZE]) * VectorClass(len_right));
-		}
-		for (x = 0; x < nstates; x++)
-			for (i = 0; i < nstates/VCSIZE; i++) {
-				// evec is not be aligned!
-				vc_evec.load_a(&evec[x*nstates+i*VCSIZE]);
-				eleft[c*nstatesqr/VCSIZE+x*nstates/VCSIZE+i] = (vc_evec * expleft[i]);
-				eright[c*nstatesqr/VCSIZE+x*nstates/VCSIZE+i] = (vc_evec * expright[i]);
-			}
+        // pre compute information for tip
+        if (child->node->isLeaf()) {
+            vector<int>::iterator it;
+            for (it = aln->seq_states[child->node->id].begin(); it != aln->seq_states[child->node->id].end(); it++) {
+                int state = (*it);
+                double *this_partial_lh_leaf = partial_lh_leaf + state*block;
+                VectorClass *echild_ptr = echild;
+                for (c = 0; c < ncat_mix; c++) {
+                    VectorClass *this_tip_partial_lh = (VectorClass*)(tip_partial_lh + state*tip_block + mix_addr_nstates[c]);
+                    for (x = 0; x < nstates; x++) {
+                        VectorClass vchild = 0.0;
+                        for (i = 0; i < nstates/VCSIZE; i++) {
+                            vchild += echild_ptr[i] * this_tip_partial_lh[i];
+                        }
+                        this_partial_lh_leaf[x] = horizontal_add(vchild);
+                        echild_ptr += nstates/VCSIZE;
+                    }
+                    this_partial_lh_leaf += nstates;
+                }
+            }
+            size_t addr = aln->STATE_UNKNOWN * block;
+            for (x = 0; x < block; x++) {
+                partial_lh_leaf[addr+x] = 1.0;
+            }
+            partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
+        }
+        echild += block*nstates/VCSIZE;
+    }
+    
+    VectorClass *eleft = echildren, *eright = echildren + block*nstates/VCSIZE;
+    
+	if (!left->node->isLeaf() && right->node->isLeaf()) {
+		PhyloNeighbor *tmp = left;
+		left = right;
+		right = tmp;
+        VectorClass *etmp = eleft;
+        eleft = eright;
+        eright = etmp;
 	}
+    
+    
+    if (node->degree() > 3) {
 
-	if (left->node->isLeaf() && right->node->isLeaf()) {
+        //--------------------- multifurcating node ------------------//
+        double sum_scale = 0.0;
+        // now for-loop computing partial_lh over all site-patterns
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
+#endif
+        for (ptn = 0; ptn < nptn; ptn++) {
+            double partial_lh_all[block];
+            for (i = 0; i < block; i++)
+                partial_lh_all[i] = 1.0;
+            dad_branch->scale_num[ptn] = 0;
+                
+            double *partial_lh_leaf = partial_lh_leaves;
+            double *echild = (double*)echildren;
+
+            FOR_NEIGHBOR_IT(node, dad, it) {
+                PhyloNeighbor *child = (PhyloNeighbor*)*it;
+                if (child->node->isLeaf()) {
+                    // external node
+                    int state_child = (ptn < orig_nptn) ? (aln->at(ptn))[child->node->id] : model_factory->unobserved_ptns[ptn-orig_nptn];
+                    double *child_lh = partial_lh_leaf + state_child*block;
+                    for (c = 0; c < block; c++) {
+                        // compute real partial likelihood vector
+                        partial_lh_all[c] *= child_lh[c];
+                    }
+                    partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
+                } else {
+                    // internal node
+                    double *partial_lh = partial_lh_all;
+                    double *partial_lh_child = child->partial_lh + ptn*block;
+                    dad_branch->scale_num[ptn] += child->scale_num[ptn];
+
+                    double *echild_ptr = echild;
+                    for (c = 0; c < ncat_mix; c++) {
+                        // compute real partial likelihood vector
+                        for (x = 0; x < nstates; x++) {
+                            double vchild = 0.0;
+//                            double *echild_ptr = echild + (c*nstatesqr+x*nstates);
+                            for (i = 0; i < nstates; i++) {
+                                vchild += echild_ptr[i] * partial_lh_child[i];
+                            }
+                            echild_ptr += nstates;
+                            partial_lh[x] *= vchild;
+                        }
+                        partial_lh += nstates;
+                        partial_lh_child += nstates;
+                    }
+                } // if
+                echild += block*nstates;
+            } // FOR_NEIGHBOR
+            
+        
+            // compute dot-product with inv_eigenvector
+            double lh_max = 0.0;
+            double *partial_lh_tmp = partial_lh_all;
+            double *partial_lh = dad_branch->partial_lh + ptn*block;
+            for (c = 0; c < ncat_mix; c++) {
+                double *inv_evec_ptr = inv_evec + mix_addr[c];
+                for (i = 0; i < nstates; i++) {
+                    double res = 0.0;
+                    for (x = 0; x < nstates; x++) {
+                        res += partial_lh_tmp[x]*inv_evec_ptr[x];
+                    }
+                    inv_evec_ptr += nstates;
+                    partial_lh[i] = res;
+                    lh_max = max(lh_max, fabs(res));
+                }
+                partial_lh += nstates;
+                partial_lh_tmp += nstates;
+            }
+            // check if one should scale partial likelihoods
+            if (lh_max < SCALING_THRESHOLD) {
+                partial_lh = dad_branch->partial_lh + ptn*block;
+                if (lh_max == 0.0) {
+                    // for very poor-quality data (lh_max is exactly zero: numerical underflow)
+                    for (c = 0; c < ncat_mix; c++)
+                        memcpy(&partial_lh[c*nstates], &tip_partial_lh[aln->STATE_UNKNOWN*nstates], nstates*sizeof(double));
+                    sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
+                    //sum_scale += log(lh_max) * ptn_freq[ptn];
+                    dad_branch->scale_num[ptn] += 4;
+                    int nsite = aln->getNSite();
+                    for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
+                        if (aln->getPatternID(i) == ptn) {
+                            outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
+                            x++;
+                        }
+                } else if (ptn_invar[ptn] == 0.0) {
+                    // now do the likelihood scaling
+                    for (i = 0; i < block; i++) {
+                        partial_lh[i] *= SCALING_THRESHOLD_INVER;
+                        //partial_lh[i] /= lh_max;
+                    }
+                    // unobserved const pattern will never have underflow
+                    sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+                    //sum_scale += log(lh_max) * ptn_freq[ptn];
+                    dad_branch->scale_num[ptn] += 1;
+                }
+            }
+
+        } // for ptn
+        dad_branch->lh_scale_factor += sum_scale;               
+                
+        // end multifurcating treatment
+    } else if (left->node->isLeaf() && right->node->isLeaf()) {
 		// special treatment for TIP-TIP (cherry) case
 
 		// pre compute information for both tips
-		double *partial_lh_left = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
-		double *partial_lh_right = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
-
-		vector<int>::iterator it;
-		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
-			int state = (*it);
-			VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
-			VectorClass vleft[VCSIZE];
-			size_t addr = state*nstates;
-			for (i = 0; i < nstates/VCSIZE; i++)
-				vc_partial_lh_tmp[i].load_a(&tip_partial_lh[addr+i*VCSIZE]);
-			for (x = 0; x < block; x+=VCSIZE) {
-				addr = x*nstates/VCSIZE;
-				for (j = 0; j < VCSIZE; j++)
-					vleft[j] = eleft[addr+j*nstates/VCSIZE] * vc_partial_lh_tmp[0];
-				for (i = 1; i < nstates/VCSIZE; i++) {
-					for (j = 0; j < VCSIZE; j++)
-						vleft[j] = mul_add(eleft[addr+j*nstates/VCSIZE+i], vc_partial_lh_tmp[i], vleft[j]);
-				}
-				horizontal_add(vleft).store_a(&partial_lh_left[state*block+x]);
-			}
-		}
-
-		for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
-			int state = (*it);
-			VectorClass vright[VCSIZE];
-			VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
-
-			for (i = 0; i < nstates/VCSIZE; i++)
-				vc_partial_lh_tmp[i].load_a(&tip_partial_lh[state*nstates+i*VCSIZE]);
-			for (x = 0; x < block; x+=VCSIZE) {
-				for (j = 0; j < VCSIZE; j++)
-					vright[j] = eright[(x+j)*nstates/VCSIZE] * vc_partial_lh_tmp[0];
-				for (i = 1; i < nstates/VCSIZE; i++) {
-					for (j = 0; j < VCSIZE; j++)
-						vright[j] = mul_add(eright[(x+j)*nstates/VCSIZE+i], vc_partial_lh_tmp[i], vright[j]);
-				}
-				horizontal_add(vright).store_a(&partial_lh_right[state*block+x]);
-			}
-		}
-
-		size_t addr_unknown = aln->STATE_UNKNOWN * block;
-		for (x = 0; x < block; x++) {
-			partial_lh_left[addr_unknown+x] = 1.0;
-			partial_lh_right[addr_unknown+x] = 1.0;
-		}
+		double *partial_lh_left = partial_lh_leaves;
+		double *partial_lh_right = partial_lh_leaves + (aln->STATE_UNKNOWN+1)*block;
 
 		// assign pointers for left and right partial_lh
-		double **lh_left_ptr = aligned_alloc<double*>(nptn);
-		double **lh_right_ptr = aligned_alloc<double*>(nptn);
-		for (ptn = 0; ptn < orig_ntn; ptn++) {
-			lh_left_ptr[ptn] = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
-			lh_right_ptr[ptn] = &partial_lh_right[block * (aln->at(ptn))[right->node->id]];
-		}
-		for (ptn = orig_ntn; ptn < nptn; ptn++) {
-			lh_left_ptr[ptn] = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
-			lh_right_ptr[ptn] = &partial_lh_right[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
-		}
+//		double **lh_left_ptr = aligned_alloc<double*>(nptn);
+//		double **lh_right_ptr = aligned_alloc<double*>(nptn);
+//		for (ptn = 0; ptn < orig_ntn; ptn++) {
+//			lh_left_ptr[ptn] = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
+//			lh_right_ptr[ptn] = &partial_lh_right[block * (aln->at(ptn))[right->node->id]];
+//		}
+//		for (ptn = orig_ntn; ptn < nptn; ptn++) {
+//			lh_left_ptr[ptn] = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
+//			lh_right_ptr[ptn] = &partial_lh_right[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
+//		}
 
 		// scale number must be ZERO
 	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
@@ -258,9 +366,17 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 		for (ptn = 0; ptn < nptn; ptn++) {
 	        double *partial_lh = dad_branch->partial_lh + ptn*block;
 
-	        double *lh_left = lh_left_ptr[ptn];
-	        double *lh_right = lh_right_ptr[ptn];
-			for (c = 0; c < ncat; c++) {
+	        double *lh_left;
+	        double *lh_right;
+            if (ptn < orig_nptn) {
+                lh_left = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
+                lh_right = &partial_lh_right[block *  (aln->at(ptn))[right->node->id]];
+            } else {
+                lh_left = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_nptn]];
+                lh_right = &partial_lh_right[block * model_factory->unobserved_ptns[ptn-orig_nptn]];
+            }
+			for (c = 0; c < ncat_mix; c++) {
+                VectorClass *vc_inv_evec_ptr = (VectorClass*)(inv_evec + mix_addr[c]);
 				// compute real partial likelihood vector
 
 				for (x = 0; x < nstates/VCSIZE; x++) {
@@ -269,11 +385,11 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 				// compute dot-product with inv_eigenvector
 				for (i = 0; i < nstates; i+=VCSIZE) {
 					for (j = 0; j < VCSIZE; j++) {
-						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec[(i+j)*nstates/VCSIZE];
+						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec_ptr[(i+j)*nstates/VCSIZE];
 					}
 					for (x = 1; x < nstates/VCSIZE; x++)
 						for (j = 0; j < VCSIZE; j++) {
-							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec[(i+j)*nstates/VCSIZE+x], res[j]);
+							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec_ptr[(i+j)*nstates/VCSIZE+x], res[j]);
 						}
 					horizontal_add(res).store_a(&partial_lh[i]);
 				}
@@ -284,52 +400,26 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 			}
 		}
 
-	    aligned_free(lh_left_ptr);
-	    aligned_free(lh_right_ptr);
-		aligned_free(partial_lh_right);
-		aligned_free(partial_lh_left);
+	    //aligned_free(lh_right_ptr);
+	    //aligned_free(lh_left_ptr);
 	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
 		// special treatment to TIP-INTERNAL NODE case
 		// only take scale_num from the right subtree
 		memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
 
 		// pre compute information for left tip
-		double *partial_lh_left = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+		double *partial_lh_left = partial_lh_leaves;
 
 
-		vector<int>::iterator it;
-		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
-			int state = (*it);
-			VectorClass vc_tip_lh[nstates/VCSIZE];
-			VectorClass vleft[VCSIZE];
-			for (i = 0; i < nstates/VCSIZE; i++)
-				vc_tip_lh[i].load_a(&tip_partial_lh[state*nstates+i*VCSIZE]);
-			for (x = 0; x < block; x+=VCSIZE) {
-				for (j = 0; j < VCSIZE; j++)
-					vleft[j] = eleft[(x+j)*nstates/VCSIZE] * vc_tip_lh[0];
-				for (i = 1; i < nstates/VCSIZE; i++) {
-					for (j = 0; j < VCSIZE; j++)
-						vleft[j] = mul_add(eleft[(x+j)*nstates/VCSIZE+i], vc_tip_lh[i], vleft[j]);
-				}
-				horizontal_add(vleft).store_a(&partial_lh_left[state*block+x]);
-			}
-		}
-
-		size_t addr_unknown = aln->STATE_UNKNOWN * block;
-		for (x = 0; x < block; x++) {
-			partial_lh_left[addr_unknown+x] = 1.0;
-		}
-
 		// assign pointers for partial_lh_left
-		double **lh_left_ptr = aligned_alloc<double*>(nptn);
-		for (ptn = 0; ptn < orig_ntn; ptn++) {
-			lh_left_ptr[ptn] = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
-		}
-		for (ptn = orig_ntn; ptn < nptn; ptn++) {
-			lh_left_ptr[ptn] = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
-		}
-
-		double sum_scale = 0.0;
+//		double **lh_left_ptr = aligned_alloc<double*>(nptn);
+//		for (ptn = 0; ptn < orig_ntn; ptn++) {
+//			lh_left_ptr[ptn] = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
+//		}
+//		for (ptn = orig_ntn; ptn < nptn; ptn++) {
+//			lh_left_ptr[ptn] = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
+//		}
+  		double sum_scale = 0.0;
 		VectorClass vc_lh_right[nstates/VCSIZE];
 		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
 		VectorClass res[VCSIZE];
@@ -343,9 +433,15 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 	        double *partial_lh = dad_branch->partial_lh + ptn*block;
 	        double *partial_lh_right = right->partial_lh + ptn*block;
 
-	        double *lh_left = lh_left_ptr[ptn];
+	        double *lh_left;
+            if (ptn < orig_nptn) {
+                lh_left = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
+            } else {
+                lh_left = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_nptn]];
+            }
 			vc_max = 0.0;
-			for (c = 0; c < ncat; c++) {
+			for (c = 0; c < ncat_mix; c++) {
+                VectorClass *vc_inv_evec_ptr = (VectorClass*)(inv_evec + mix_addr[c]);
 				// compute real partial likelihood vector
 				for (i = 0; i < nstates/VCSIZE; i++)
 					vc_lh_right[i].load_a(&partial_lh_right[i*VCSIZE]);
@@ -365,11 +461,11 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 				// compute dot-product with inv_eigenvector
 				for (i = 0; i < nstates; i+=VCSIZE) {
 					for (j = 0; j < VCSIZE; j++) {
-						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec[(i+j)*nstates/VCSIZE];
+						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec_ptr[(i+j)*nstates/VCSIZE];
 					}
 					for (x = 1; x < nstates/VCSIZE; x++) {
 						for (j = 0; j < VCSIZE; j++) {
-							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec[(i+j)*nstates/VCSIZE+x], res[j]);
+							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec_ptr[(i+j)*nstates/VCSIZE+x], res[j]);
 						}
 					}
 					VectorClass sum_res = horizontal_add(res);
@@ -398,8 +494,7 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 		}
 		dad_branch->lh_scale_factor += sum_scale;
 
-	    aligned_free(lh_left_ptr);
-		aligned_free(partial_lh_left);
+	    //aligned_free(lh_left_ptr);
 
 	} else {
 		// both left and right are internal node
@@ -421,7 +516,8 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 
 			dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
 			vc_max = 0.0;
-			for (c = 0; c < ncat; c++) {
+			for (c = 0; c < ncat_mix; c++) {
+                VectorClass *vc_inv_evec_ptr = (VectorClass*)(inv_evec + mix_addr[c]);
 				// compute real partial likelihood vector
 				for (i = 0; i < nstates/VCSIZE; i++) {
 					vc_lh_left[i].load_a(&partial_lh_left[i*VCSIZE]);
@@ -447,11 +543,11 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 				// compute dot-product with inv_eigenvector
 				for (i = 0; i < nstates; i+=VCSIZE) {
 					for (j = 0; j < VCSIZE; j++) {
-						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec[(i+j)*nstates/VCSIZE];
+						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec_ptr[(i+j)*nstates/VCSIZE];
 					}
 					for (x = 1; x < nstates/VCSIZE; x++)
 						for (j = 0; j < VCSIZE; j++)
-							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec[(i+j)*nstates/VCSIZE+x], res[j]);
+							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec_ptr[(i+j)*nstates/VCSIZE+x], res[j]);
 
 					VectorClass sum_res = horizontal_add(res);
 					sum_res.store_a(&partial_lh[i]);
@@ -482,8 +578,9 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 
 	}
 
-	aligned_free(eright);
-	aligned_free(eleft);
+	if (partial_lh_leaves)
+        aligned_free(partial_lh_leaves);
+	aligned_free(echildren);
 }
 
 template <class VectorClass, const int VCSIZE, const int nstates>
@@ -506,14 +603,19 @@ void PhyloTree::computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloN
         computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
     df = ddf = 0.0;
     size_t ncat = site_rate->getNRate();
-
-    size_t block = ncat * nstates;
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
+    size_t block = ncat_mix * nstates;
+    size_t tip_block = nstates * model->getNMixtures();
     size_t ptn; // for big data size > 4GB memory required
     size_t c, i, j;
     size_t orig_nptn = aln->size();
     size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
     size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
     maxptn = max(maxptn, aln->size()+((model_factory->unobserved_ptns.size()+VCSIZE-1)/VCSIZE)*VCSIZE);
+
+    size_t mix_addr_nstates[ncat_mix];
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+
     double *eval = model->getEigenvalues();
     assert(eval);
 
@@ -522,11 +624,15 @@ void PhyloTree::computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloN
 	VectorClass *vc_val2 = (VectorClass*)aligned_alloc<double>(block);
 
 	VectorClass vc_len = dad_branch->length;
-	for (c = 0; c < ncat; c++) {
-		VectorClass vc_rate = site_rate->getRate(c);
-		VectorClass vc_prop = site_rate->getProp(c);
+	for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom;
+        mix_addr_nstates[c] = m*nstates;
+        size_t mycat = c%ncat;
+        double *eval_ptr = eval + m*nstates;
+		VectorClass vc_rate = site_rate->getRate(mycat);
+		VectorClass vc_prop = site_rate->getProp(mycat) * model->getMixtureWeight(m);
 		for (i = 0; i < nstates/VCSIZE; i++) {
-			VectorClass cof = VectorClass().load_a(&eval[i*VCSIZE]) * vc_rate;
+			VectorClass cof = VectorClass().load_a(&eval_ptr[i*VCSIZE]) * vc_rate;
 			VectorClass val = exp(cof*vc_len) * vc_prop;
 			VectorClass val1_ = cof*val;
 			vc_val0[c*nstates/VCSIZE+i] = val;
@@ -543,24 +649,20 @@ void PhyloTree::computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloN
 		if (dad->isLeaf()) {
 	    	// special treatment for TIP-INTERNAL NODE case
 #ifdef _OPENMP
-#pragma omp parallel for private(ptn, i)
+#pragma omp parallel for private(ptn, i, c)
 #endif
-			for (ptn = 0; ptn < orig_nptn; ptn++) {
+			for (ptn = 0; ptn < nptn; ptn++) {
 			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
 				double *theta = theta_all + ptn*block;
-				double *lh_dad = &tip_partial_lh[(aln->at(ptn))[dad->id] * nstates];
-				for (i = 0; i < block; i+=VCSIZE) {
-					(VectorClass().load_a(&lh_dad[i%nstates]) * VectorClass().load_a(&partial_lh_dad[i])).store_a(&theta[i]);
-				}
-			}
-			// ascertainment bias correction
-			for (ptn = orig_nptn; ptn < nptn; ptn++) {
-			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
-				double *theta = theta_all + ptn*block;
-				double *lh_dad = &tip_partial_lh[model_factory->unobserved_ptns[ptn-orig_nptn] * nstates];
-				for (i = 0; i < block; i+=VCSIZE) {
-					(VectorClass().load_a(&lh_dad[i%nstates]) * VectorClass().load_a(&partial_lh_dad[i])).store_a(&theta[i]);
-				}
+                double *this_tip_partial_lh = tip_partial_lh + tip_block*((ptn < orig_nptn) ? (aln->at(ptn))[dad->id] :  model_factory->unobserved_ptns[ptn-orig_nptn]);
+                for (c = 0; c < ncat_mix; c++) {
+                    double *lh_dad = this_tip_partial_lh + mix_addr_nstates[c];
+                    for (i = 0; i < nstates; i+=VCSIZE) {
+                        (VectorClass().load_a(&lh_dad[i]) * VectorClass().load_a(&partial_lh_dad[i])).store_a(&theta[i]);
+                    }
+                    partial_lh_dad += nstates;
+                    theta += nstates;
+                }
 			}
 	    } else {
 	    	// both dad and node are internal nodes
@@ -754,8 +856,12 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
         computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
     double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
     size_t ncat = site_rate->getNRate();
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+    size_t mix_addr_nstates[ncat_mix];
 
-    size_t block = ncat * nstates;
+    size_t block = ncat_mix * nstates;
+    size_t tip_block = nstates * model->getNMixtures();
     size_t ptn; // for big data size > 4GB memory required
     size_t c, i, j;
     size_t orig_nptn = aln->size();
@@ -768,13 +874,16 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
     VectorClass *vc_val = (VectorClass*)aligned_alloc<double>(block);
 
 
-	for (c = 0; c < ncat; c++) {
-		double len = site_rate->getRate(c)*dad_branch->length;
-		VectorClass vc_len(len);
-		VectorClass vc_prop(site_rate->getProp(c));
+	for (c = 0; c < ncat_mix; c++) {
+        size_t mycat = c%ncat;
+        size_t m = c/denom;
+        mix_addr_nstates[c] = m*nstates;
+        double *eval_ptr = eval + mix_addr_nstates[c];
+		VectorClass vc_len(site_rate->getRate(mycat)*dad_branch->length);
+		VectorClass vc_prop(site_rate->getProp(c) * model->getMixtureWeight(m));
 		for (i = 0; i < nstates/VCSIZE; i++) {
 			// eval is not aligned!
-			vc_val[c*nstates/VCSIZE+i] = exp(VectorClass().load_a(&eval[i*VCSIZE]) * vc_len) * vc_prop;
+			vc_val[c*nstates/VCSIZE+i] = exp(VectorClass().load_a(&eval_ptr[i*VCSIZE]) * vc_len) * vc_prop;
 		}
 	}
 
@@ -782,51 +891,90 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
 
 	if (dad->isLeaf()) {
     	// special treatment for TIP-INTERNAL NODE case
-    	VectorClass vc_tip_partial_lh[nstates];
-    	VectorClass vc_partial_lh_dad[VCSIZE], vc_ptn[VCSIZE];
+
+    	// precompute information from one tip        
+    	double *partial_lh_node = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+    	IntVector states_dad = aln->seq_states[dad->id];
+    	states_dad.push_back(aln->STATE_UNKNOWN);
+    	for (IntVector::iterator it = states_dad.begin(); it != states_dad.end(); it++) {
+    		double *lh_node = partial_lh_node + (*it)*block;
+    		double *lh_tip = tip_partial_lh + (*it)*tip_block;
+    		VectorClass *vc_val_tmp = vc_val;
+            for (c = 0; c < ncat_mix; c++) {
+                double *this_lh_tip = lh_tip + mix_addr_nstates[c];
+                for (i = 0; i < nstates; i+=VCSIZE) {
+                    (vc_val_tmp[i/VCSIZE] * VectorClass().load_a(&this_lh_tip[i])).store_a(&lh_node[i]);
+                }
+                lh_node += nstates;
+                vc_val_tmp += nstates/VCSIZE;
+            }
+    	}
+
+
+    	//VectorClass vc_tip_partial_lh[nstates];
+    	//VectorClass vc_partial_lh_dad[VCSIZE]
+        VectorClass vc_ptn[VCSIZE];
     	VectorClass lh_final(0.0), vc_freq;
 		VectorClass lh_ptn; // store likelihoods of VCSIZE consecutive patterns
-
-    	double **lh_states_dad = aligned_alloc<double*>(maxptn);
-    	for (ptn = 0; ptn < orig_nptn; ptn++)
-    		lh_states_dad[ptn] = &tip_partial_lh[(aln->at(ptn))[dad->id] * nstates];
-    	for (ptn = orig_nptn; ptn < nptn; ptn++)
-    		lh_states_dad[ptn] = &tip_partial_lh[model_factory->unobserved_ptns[ptn-orig_nptn] * nstates];
-    	// initialize beyond #patterns for efficiency
-    	for (ptn = nptn; ptn < maxptn; ptn++)
-    		lh_states_dad[ptn] = &tip_partial_lh[aln->STATE_UNKNOWN * nstates];
+//    	double **lh_states_dad = aligned_alloc<double*>(maxptn);
+//    	for (ptn = 0; ptn < orig_nptn; ptn++)
+//    		lh_states_dad[ptn] = &tip_partial_lh[(aln->at(ptn))[dad->id] * tip_block];
+//    	for (ptn = orig_nptn; ptn < nptn; ptn++)
+//    		lh_states_dad[ptn] = &tip_partial_lh[model_factory->unobserved_ptns[ptn-orig_nptn] * tip_block];
+//    	// initialize beyond #patterns for efficiency
+//    	for (ptn = nptn; ptn < maxptn; ptn++)
+//    		lh_states_dad[ptn] = &tip_partial_lh[aln->STATE_UNKNOWN * tip_block];
+		int *ptn_states_dad = aligned_alloc<int>(maxptn);
+		for (ptn = 0; ptn < orig_nptn; ptn++)
+			ptn_states_dad[ptn] = (aln->at(ptn))[dad->id];
+		for (ptn = orig_nptn; ptn < nptn; ptn++)
+			ptn_states_dad[ptn] = model_factory->unobserved_ptns[ptn-orig_nptn];
+		// initialize beyond #patterns for efficiency
+		for (ptn = nptn; ptn < maxptn; ptn++)
+			ptn_states_dad[ptn] = aln->STATE_UNKNOWN;
 
 		// copy dummy values because VectorClass will access beyond nptn
 		for (ptn = nptn; ptn < maxptn; ptn++)
 			memcpy(&dad_branch->partial_lh[ptn*block], dad_branch->partial_lh, block*sizeof(double));
 
 #ifdef _OPENMP
-#pragma omp parallel private(ptn, i, j, vc_tip_partial_lh, vc_partial_lh_dad, vc_ptn, vc_freq, lh_ptn)
+#pragma omp parallel private(ptn, i, j, vc_ptn, vc_freq, lh_ptn)
     {
     	VectorClass lh_final_th = 0.0;
 #pragma omp for nowait
 #endif
    		// main loop over all patterns with a step size of VCSIZE
 		for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
-			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+			//double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
 
-			// initialize vc_tip_partial_lh
 			for (j = 0; j < VCSIZE; j++) {
-				double *lh_dad = lh_states_dad[ptn+j];
-				for (i = 0; i < nstates/VCSIZE; i++) {
-					vc_tip_partial_lh[j*(nstates/VCSIZE)+i].load_a(&lh_dad[i*VCSIZE]);
+				vc_ptn[j] = 0.0;
+				double *partial_lh_dad = dad_branch->partial_lh + (ptn+j)*block;
+				int state_dad = ptn_states_dad[ptn+j];
+				double *lh_node = &partial_lh_node[state_dad*block];
+				for (i = 0; i < block; i+=VCSIZE) {
+					vc_ptn[j] = mul_add(VectorClass().load_a(&lh_node[i]),
+							VectorClass().load_a(&partial_lh_dad[i]), vc_ptn[j]);
 				}
-				vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block]);
-				vc_ptn[j] = vc_val[0] * vc_tip_partial_lh[j*(nstates/VCSIZE)] * vc_partial_lh_dad[j];
 			}
 
-			// compute vc_ptn
-			for (i = 1; i < block/VCSIZE; i++)
-				for (j = 0; j < VCSIZE; j++) {
-					vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block+i*VCSIZE]);
-					vc_ptn[j] = mul_add(vc_val[i] * vc_tip_partial_lh[j*(nstates/VCSIZE)+i%(nstates/VCSIZE)],
-							vc_partial_lh_dad[j], vc_ptn[j]);
-				}
+			// initialize vc_tip_partial_lh
+//			for (j = 0; j < VCSIZE; j++) {
+//				double *lh_dad = lh_states_dad[ptn+j];
+//				for (i = 0; i < nstates/VCSIZE; i++) {
+//					vc_tip_partial_lh[j*(nstates/VCSIZE)+i].load_a(&lh_dad[i*VCSIZE]);
+//				}
+//				vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block]);
+//				vc_ptn[j] = vc_val[0] * vc_tip_partial_lh[j*(nstates/VCSIZE)] * vc_partial_lh_dad[j];
+//			}
+//
+//			// compute vc_ptn
+//			for (i = 1; i < block/VCSIZE; i++)
+//				for (j = 0; j < VCSIZE; j++) {
+//					vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block+i*VCSIZE]);
+//					vc_ptn[j] = mul_add(vc_val[i] * vc_tip_partial_lh[j*(nstates/VCSIZE)+i%(nstates/VCSIZE)],
+//							vc_partial_lh_dad[j], vc_ptn[j]);
+//				}
 
 			vc_freq.load_a(&ptn_freq[ptn]);
 			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
@@ -869,32 +1017,23 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
 //            cout << "WARNING: Tree log-likelihood is set to " << tree_lh << endl;
         }
 
-		// ascertainment bias correction
 		if (orig_nptn < nptn) {
 			lh_final = 0.0;
 			lh_ptn = 0.0;
 			for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
-				double *partial_lh_dad = &dad_branch->partial_lh[ptn*block];
+//				double *partial_lh_dad = &dad_branch->partial_lh[ptn*block];
 				lh_final += lh_ptn;
-
-				// initialize vc_tip_partial_lh
 				for (j = 0; j < VCSIZE; j++) {
-					double *lh_dad = lh_states_dad[ptn+j];
-					for (i = 0; i < nstates/VCSIZE; i++) {
-						vc_tip_partial_lh[j*(nstates/VCSIZE)+i].load(&lh_dad[i*VCSIZE]); // lh_dad is not aligned!
+					vc_ptn[j] = 0.0;
+					double *partial_lh_dad = dad_branch->partial_lh + (ptn+j)*block;
+					int state_dad = ptn_states_dad[ptn+j];
+					double *lh_node = &partial_lh_node[state_dad*block];
+					for (i = 0; i < block; i+=VCSIZE) {
+						vc_ptn[j] = mul_add(VectorClass().load_a(&lh_node[i]),
+								VectorClass().load_a(&partial_lh_dad[i]), vc_ptn[j]);
 					}
-					vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block]);
-					vc_ptn[j] = vc_val[0] * vc_tip_partial_lh[j*(nstates/VCSIZE)] * vc_partial_lh_dad[j];
 				}
 
-				// compute vc_ptn
-				for (i = 1; i < block/VCSIZE; i++)
-					for (j = 0; j < VCSIZE; j++) {
-						vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block+i*VCSIZE]);
-						vc_ptn[j] = mul_add(vc_val[i] * vc_tip_partial_lh[j*(nstates/VCSIZE)+i%(nstates/VCSIZE)],
-								vc_partial_lh_dad[j], vc_ptn[j]);
-					}
-                    
                 // bugfix 2016-01-21, prob_const can be rescaled
                 for (j = 0; j < VCSIZE; j++)
                     if (dad_branch->scale_num[ptn+j] >= 1)
@@ -911,7 +1050,54 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
 			default: assert(0); break;
 			}
 		}
-		aligned_free(lh_states_dad);
+		aligned_free(ptn_states_dad);
+		aligned_free(partial_lh_node);
+        
+
+
+		// ascertainment bias correction
+//		if (orig_nptn < nptn) {
+//			lh_final = 0.0;
+//			lh_ptn = 0.0;
+//			for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+//				double *partial_lh_dad = &dad_branch->partial_lh[ptn*block];
+//				lh_final += lh_ptn;
+//
+//				// initialize vc_tip_partial_lh
+//				for (j = 0; j < VCSIZE; j++) {
+//					double *lh_dad = lh_states_dad[ptn+j];
+//					for (i = 0; i < nstates/VCSIZE; i++) {
+//						vc_tip_partial_lh[j*(nstates/VCSIZE)+i].load(&lh_dad[i*VCSIZE]); // lh_dad is not aligned!
+//					}
+//					vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block]);
+//					vc_ptn[j] = vc_val[0] * vc_tip_partial_lh[j*(nstates/VCSIZE)] * vc_partial_lh_dad[j];
+//				}
+//
+//				// compute vc_ptn
+//				for (i = 1; i < block/VCSIZE; i++)
+//					for (j = 0; j < VCSIZE; j++) {
+//						vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block+i*VCSIZE]);
+//						vc_ptn[j] = mul_add(vc_val[i] * vc_tip_partial_lh[j*(nstates/VCSIZE)+i%(nstates/VCSIZE)],
+//								vc_partial_lh_dad[j], vc_ptn[j]);
+//					}
+//                    
+//                // bugfix 2016-01-21, prob_const can be rescaled
+//                for (j = 0; j < VCSIZE; j++)
+//                    if (dad_branch->scale_num[ptn+j] >= 1)
+//                        vc_ptn[j] = vc_ptn[j] * SCALING_THRESHOLD;
+//
+//				// ptn_invar[ptn] is not aligned
+//				lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+//			}
+//			switch ((nptn-orig_nptn)%VCSIZE) {
+//			case 0: prob_const = horizontal_add(lh_final+lh_ptn); break;
+//			case 1: prob_const = horizontal_add(lh_final)+lh_ptn[0]; break;
+//			case 2: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]; break;
+//			case 3: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2]; break;
+//			default: assert(0); break;
+//			}
+//		}
+//		aligned_free(lh_states_dad);
     } else {
     	// both dad and node are internal nodes
     	VectorClass vc_partial_lh_node[VCSIZE];
@@ -1032,7 +1218,10 @@ double PhyloTree::computeLikelihoodFromBufferEigenSIMD() {
 	double tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
 
     size_t ncat = site_rate->getNRate();
-    size_t block = ncat * nstates;
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+
+    size_t block = ncat_mix * nstates;
     size_t ptn; // for big data size > 4GB memory required
     size_t c, i, j;
     size_t orig_nptn = aln->size();
@@ -1044,11 +1233,14 @@ double PhyloTree::computeLikelihoodFromBufferEigenSIMD() {
 	VectorClass *vc_val0 = (VectorClass*)aligned_alloc<double>(block);
 
 	VectorClass vc_len = current_it->length;
-	for (c = 0; c < ncat; c++) {
-		VectorClass vc_rate = site_rate->getRate(c);
-		VectorClass vc_prop = site_rate->getProp(c);
+	for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom;
+        double *eval_ptr = eval + (m)*nstates;
+        size_t mycat = c%ncat;
+		VectorClass vc_rate = site_rate->getRate(mycat);
+		VectorClass vc_prop = site_rate->getProp(mycat) * model->getMixtureWeight(m);
 		for (i = 0; i < nstates/VCSIZE; i++) {
-			VectorClass cof = VectorClass().load_a(&eval[i*VCSIZE]) * vc_rate;
+			VectorClass cof = VectorClass().load_a(&eval_ptr[i*VCSIZE]) * vc_rate;
 			VectorClass val = exp(cof*vc_len) * vc_prop;
 			vc_val0[c*nstates/VCSIZE+i] = val;
 		}
@@ -1194,7 +1386,7 @@ double PhyloTree::computeLikelihoodFromBufferEigenSIMD() {
 
     return tree_lh;
 }
-
+*/
 /****************************************************************************
         Highly optimized Parsimony function
  ****************************************************************************/
@@ -1270,8 +1462,8 @@ void PhyloTree::computePartialParsimonyFastSIMD(PhyloNeighbor *dad_branch, Phylo
     if (dad_branch->partial_lh_computed & 2)
         return;
     Node *node = dad_branch->node;
-    int nstates = aln->num_states;
-    int site;
+    int nstates = aln->getMaxNumStates();
+    int site = 0;
     const int VCSIZE = VectorClass::size();
     const int NUM_BITS = VectorClass::size() * UINT_BITS;
 
@@ -1279,130 +1471,144 @@ void PhyloTree::computePartialParsimonyFastSIMD(PhyloNeighbor *dad_branch, Phylo
 
     if (node->isLeaf() && dad) {
         // external node
+        vector<Alignment*> *partitions = NULL;
+        if (aln->isSuperAlignment())
+            partitions = &((SuperAlignment*)aln)->partitions;
+        else {
+            partitions = new vector<Alignment*>;
+            partitions->push_back(aln);
+        }
         if (aln->ordered_pattern.empty())
             aln->orderPatternByNumChars();
         int leafid = node->id;
         int pars_size = getBitsBlockSize();
         memset(dad_branch->partial_pars, 0, pars_size*sizeof(UINT));
-//        int ptn;
-//        int nptn = aln->size();
     	int ambi_aa[] = {2, 3, 5, 6, 9, 10}; // {4+8, 32+64, 512+1024};
-//        int max_sites = ((aln->num_informative_sites+UINT_BITS-1)/UINT_BITS)*UINT_BITS;
-//        UINT *x = dad_branch->partial_pars - (nstates*VCSIZE);
         UINT *x = dad_branch->partial_pars;
-        Alignment::iterator pat;
-    	switch (aln->seq_type) {
-    	case SEQ_DNA:
-            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
-            	int state = pat->at(leafid);
-                int freq = pat->frequency;
-                if (state < 4) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += 4*VCSIZE;
-                            site = 0;
+        int start_pos = 0;
+
+        for (vector<Alignment*>::iterator alnit = partitions->begin(); alnit != partitions->end(); alnit++) {
+            int end_pos = start_pos + (*alnit)->ordered_pattern.size();
+            switch ((*alnit)->seq_type) {
+            case SEQ_DNA:
+                for (int patid = start_pos; patid != end_pos; patid++) {
+                    Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
+                    int state = pat->at(leafid);
+                    int freq = pat->frequency;
+                    if (state < 4) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
                         }
-                        x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
-                    }
-                } else if (state == aln->STATE_UNKNOWN) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += 4*VCSIZE;
-                            site = 0;
+                    } else if (state == (*alnit)->STATE_UNKNOWN) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            UINT *p = x+(site/UINT_BITS);
+                            p[0] |= bit1;
+                            p[VCSIZE] |= bit1;
+                            p[2*VCSIZE] |= bit1;
+                            p[3*VCSIZE] |= bit1;
                         }
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        UINT *p = x+(site/UINT_BITS);
-                        p[0] |= bit1;
-                        p[VCSIZE] |= bit1;
-                        p[2*VCSIZE] |= bit1;
-                        p[3*VCSIZE] |= bit1;
-                    }
-                } else {
-                	state -= 3;
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += 4*VCSIZE;
-                            site = 0;
+                    } else {
+                        state -= 3;
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            UINT *p = x + ((site/UINT_BITS));
+                            
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            for (int i = 0; i < 4; i++)
+                                if (state & (1<<i))
+                                    p[i*VCSIZE] |= bit1;
                         }
-                        UINT *p = x + ((site/UINT_BITS));
-                        
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        for (int i = 0; i < 4; i++)
-                            if (state & (1<<i))
-                                p[i*VCSIZE] |= bit1;
                     }
                 }
-            }
-    		break;
-    	case SEQ_PROTEIN:
-            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
-            	int state = pat->at(leafid);
-                int freq = pat->frequency;
-                if (state < 20) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += 20*VCSIZE;
-                            site = 0;
+                break;
+            case SEQ_PROTEIN:
+                for (int patid = start_pos; patid != end_pos; patid++) {
+                    Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
+                    int state = pat->at(leafid);
+                    int freq = pat->frequency;
+                    if (state < 20) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
                         }
-                        x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
-                    }
-                } else if (state == aln->STATE_UNKNOWN) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += 20*VCSIZE;
-                            site = 0;
+                    } else if (state == (*alnit)->STATE_UNKNOWN) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            UINT *p = x+(site/UINT_BITS);
+                            for (int i = 0; i < 20; i++)
+                                p[i*VCSIZE] |= bit1;
                         }
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        UINT *p = x+(site/UINT_BITS);
-                        for (int i = 0; i < 20; i++)
-                            p[i*VCSIZE] |= bit1;
-                    }
-                } else {
-                	assert(state < 23);
-            		state = (state-20)*2;
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += 20*VCSIZE;
-                            site = 0;
+                    } else {
+                        assert(state < 23);
+                        state = (state-20)*2;
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            UINT *p = x + ((site/UINT_BITS));
+                            UINT bit1 = (1 << (site%UINT_BITS));
+
+                            p[ambi_aa[state]*VCSIZE] |= bit1;
+                            p[ambi_aa[state+1]*VCSIZE] |= bit1;
                         }
-                        UINT *p = x + ((site/UINT_BITS));
-                        UINT bit1 = (1 << (site%UINT_BITS));
-
-                        p[ambi_aa[state]*VCSIZE] |= bit1;
-                        p[ambi_aa[state+1]*VCSIZE] |= bit1;
                     }
                 }
-            }
-    		break;
-    	default:
-            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
-            	int state = pat->at(leafid);
-                int freq = pat->frequency;
-                if (state < nstates) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += nstates*VCSIZE;
-                            site = 0;
+                break;
+            default:
+                for (int patid = start_pos; patid != end_pos; patid++) {
+                    Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
+                    int state = pat->at(leafid);
+                    int freq = pat->frequency;
+                    if (state < (*alnit)->num_states) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
                         }
-                        x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
-                    }
-                } else if (state == aln->STATE_UNKNOWN) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += nstates*VCSIZE;
-                            site = 0;
+                    } else if (state == (*alnit)->STATE_UNKNOWN) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            UINT *p = x+(site/UINT_BITS);
+                            for (int i = 0; i < (*alnit)->num_states; i++)
+                                p[i*VCSIZE] |= bit1;
                         }
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        UINT *p = x+(site/UINT_BITS);
-                        for (int i = 0; i < nstates; i++)
-                            p[i*VCSIZE] |= bit1;
+                    } else {
+                        assert(0);
                     }
-                } else {
-                	assert(0);
                 }
-            }
-    		break;
-    	}
+                break;
+            } // end of switch
+            start_pos = end_pos;
+        } // end of FOR loop
+
+        assert(start_pos == aln->ordered_pattern.size());
+//        assert(site == aln->num_informative_sites % NUM_BITS);
         // add dummy states
         if (site > 0 && site < NUM_BITS) {
             x += site/UINT_BITS;
@@ -1411,6 +1617,8 @@ void PhyloTree::computePartialParsimonyFastSIMD(PhyloNeighbor *dad_branch, Phylo
             int max_sites = ((site+UINT_BITS-1)/UINT_BITS);
             memset(x, 255, (VCSIZE - max_sites)*sizeof(UINT));
         }
+        if (!aln->isSuperAlignment())
+            delete partitions;
     } else {
         // internal node
         assert(node->degree() == 3); // it works only for strictly bifurcating tree
@@ -1433,7 +1641,7 @@ void PhyloTree::computePartialParsimonyFastSIMD(PhyloNeighbor *dad_branch, Phylo
             #pragma omp parallel for private (site) reduction(+: score) if(nsites>200)
             #endif
 			for (site = 0; site<nsites; site++) {
-                size_t offset = 4*VCSIZE*site;
+                size_t offset = entry_size*site;
                 VectorClass *x = (VectorClass*)(left->partial_pars + offset);
                 VectorClass *y = (VectorClass*)(right->partial_pars + offset);
                 VectorClass *z = (VectorClass*)(dad_branch->partial_pars + offset);
@@ -1504,7 +1712,7 @@ int PhyloTree::computeParsimonyBranchFastSIMD(PhyloNeighbor *dad_branch, PhyloNo
     if ((node_branch->partial_lh_computed & 2) == 0)
         computePartialParsimonyFastSIMD<VectorClass>(node_branch, node);
     int site;
-    int nstates = aln->num_states;
+    int nstates = aln->getMaxNumStates();
 
 //    VectorClass score = 0;
 //    VectorClass w;
diff --git a/phylokernelavx512.cpp b/phylokernelavx512.cpp
new file mode 100644
index 0000000..3c1e86a
--- /dev/null
+++ b/phylokernelavx512.cpp
@@ -0,0 +1,120 @@
+/*
+ * phylokernelavx512.cpp
+ *
+ *  Created on: Sept 25, 2016
+ *      Author: minh
+ */
+
+
+#define MAX_VECTOR_SIZE 512 // for VectorClass
+
+#include "vectorclass/vectorclass.h"
+#include "vectorclass/vectormath_exp.h"
+#include "phylokernel.h"
+//#include "phylokernelsafe.h"
+//#include "phylokernelmixture.h"
+//#include "phylokernelmixrate.h"
+//#include "phylokernelsitemodel.h"
+
+#include "phylokernelnew.h"
+#define KERNEL_FIX_STATES
+#include "phylokernelnew.h"
+
+
+#if !defined ( __AVX512F__ ) && !defined ( __AVX512__ )
+#error "You must compile this file with AVX512 enabled!"
+#endif
+
+void PhyloTree::setDotProductAVX512() { // point the dotProduct / dotProductDouble member hooks at dotProductSIMD instantiations
+#ifdef BOOT_VAL_FLOAT
+		dotProduct = &PhyloTree::dotProductSIMD<float, Vec16f>; // BOOT_VAL_FLOAT defined: float values with Vec16f
+#else
+		dotProduct = &PhyloTree::dotProductSIMD<double, Vec8d>; // BOOT_VAL_FLOAT not defined: double values with Vec8d
+#endif
+        dotProductDouble = &PhyloTree::dotProductSIMD<double, Vec8d>; // set unconditionally to the double/Vec8d instantiation
+}
+
+void PhyloTree::setLikelihoodKernelAVX512() { // assign the four likelihood kernel pointers (branch, derv, partial, from-buffer) to Vec8d instantiations
+//    setParsimonyKernelAVX();
+    if (model_factory && model_factory->model->isSiteSpecificModel()) { // site-specific model: NORM_LH kernels with two trailing 'true' template arguments; NOTE(review): this branch returns before params->lk_safe_scaling is consulted -- confirm intended
+        switch (aln->num_states) { // fixed-state instantiations exist here for 4 and 20 states only
+        case 4:
+            computeLikelihoodBranchPointer     = &PhyloTree::computeLikelihoodBranchSIMD    <Vec8d, NORM_LH, 4, true, true>;
+            computeLikelihoodDervPointer       = &PhyloTree::computeLikelihoodDervSIMD      <Vec8d, NORM_LH, 4, true, true>;
+            computePartialLikelihoodPointer    =  &PhyloTree::computePartialLikelihoodSIMD  <Vec8d, NORM_LH, 4, true, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec8d, NORM_LH, 4, true, true>;
+            break;
+        case 20:
+            computeLikelihoodBranchPointer     = &PhyloTree::computeLikelihoodBranchSIMD    <Vec8d, NORM_LH, 20, true, true>;
+            computeLikelihoodDervPointer       = &PhyloTree::computeLikelihoodDervSIMD      <Vec8d, NORM_LH, 20, true, true>;
+            computePartialLikelihoodPointer    = &PhyloTree::computePartialLikelihoodSIMD   <Vec8d, NORM_LH, 20, true, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec8d, NORM_LH, 20, true, true>;
+            break;
+        default: // any other state count (including 2) uses the *GenericSIMD kernels, which take no state-count template argument
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD        <Vec8d, NORM_LH, true, true>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD            <Vec8d, NORM_LH, true, true>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD      <Vec8d, NORM_LH, true, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec8d, NORM_LH, true, true>;
+            break;
+        }
+        return; // site-specific kernels chosen; skip the dispatch below
+    }
+
+    if (params->lk_safe_scaling) { // safe scaling requested: same dispatch as the final switch, but with SAFE_LH
+	switch(aln->num_states) {
+        case 2:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec8d, SAFE_LH, 2, true>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec8d, SAFE_LH, 2, true>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec8d, SAFE_LH, 2, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec8d, SAFE_LH, 2, true>;
+            break;
+        case 4:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec8d, SAFE_LH, 4, true>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec8d, SAFE_LH, 4, true>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec8d, SAFE_LH, 4, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec8d, SAFE_LH, 4, true>;
+            break;
+        case 20:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec8d, SAFE_LH, 20, true>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec8d, SAFE_LH, 20, true>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec8d, SAFE_LH, 20, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec8d, SAFE_LH, 20, true>;
+            break;
+        default: // state counts other than 2, 4 and 20 use the *GenericSIMD kernels
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD<Vec8d, SAFE_LH, true>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD<Vec8d, SAFE_LH, true>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD<Vec8d, SAFE_LH, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec8d, SAFE_LH, true>;
+            break;
+        }
+        return; // SAFE_LH kernels chosen; skip the NORM_LH dispatch below
+    }
+
+	switch(aln->num_states) { // remaining case (not site-specific, lk_safe_scaling off): NORM_LH kernels
+	case 2:
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec8d, NORM_LH, 2, true>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec8d, NORM_LH, 2, true>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec8d, NORM_LH, 2, true>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec8d, NORM_LH, 2, true>;
+		break;
+	case 4:
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec8d, NORM_LH, 4, true>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec8d, NORM_LH, 4, true>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec8d, NORM_LH, 4, true>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec8d, NORM_LH, 4, true>;
+		break;
+	case 20:
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec8d, NORM_LH, 20, true>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec8d, NORM_LH, 20, true>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec8d, NORM_LH, 20, true>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec8d, NORM_LH, 20, true>;
+		break;
+	default: // state counts other than 2, 4 and 20 use the *GenericSIMD kernels
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD<Vec8d, NORM_LH, true>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD<Vec8d, NORM_LH, true>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD<Vec8d, NORM_LH, true>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec8d, NORM_LH, true>;
+		break;
+	}
+} // NOTE(review): setLikelihoodKernelFMA assigns vector_size = 4 before dispatching; no vector_size assignment appears in this function -- confirm it is set elsewhere for AVX-512
+
diff --git a/phylokernelfma.cpp b/phylokernelfma.cpp
new file mode 100644
index 0000000..1e9909c
--- /dev/null
+++ b/phylokernelfma.cpp
@@ -0,0 +1,164 @@
+/*
+ * phylokernelfma.cpp
+ *
+ *  Created on: Sept 25, 2016
+ *      Author: minh
+ */
+
+
+#include "vectorclass/vectormath_exp.h"
+#include "vectorclass/vectorclass.h"
+#include "phylokernel.h"
+//#include "phylokernelsafe.h"
+//#include "phylokernelmixture.h"
+//#include "phylokernelmixrate.h"
+//#include "phylokernelsitemodel.h"
+
+#include "phylokernelnew.h"
+#define KERNEL_FIX_STATES
+#include "phylokernelnew.h"
+
+#if !defined(__AVX2__) && !defined(__FMA__)
+#error "You must compile this file with AVX2 or FMA enabled!"
+#endif
+
+/**
+    Install the SIMD dot-product routines used by the FMA build.
+    dotProduct is instantiated for float/Vec8f when BOOT_VAL_FLOAT is defined
+    and for double/Vec4d otherwise; dotProductDouble is always double/Vec4d.
+*/
+void PhyloTree::setDotProductFMA() {
+    dotProductDouble = &PhyloTree::dotProductSIMD<double, Vec4d>;
+#if defined(BOOT_VAL_FLOAT)
+    dotProduct = &PhyloTree::dotProductSIMD<float, Vec8f>;
+#else
+    dotProduct = &PhyloTree::dotProductSIMD<double, Vec4d>;
+#endif
+}
+
+/**
+    Select the Vec4d likelihood kernels for the FMA build.
+    Sets vector_size to 4 and points the four kernel function pointers
+    (branch likelihood, derivatives, partial likelihood, likelihood from buffer)
+    to the instantiation matching
+      - whether the model is site-specific,
+      - whether safe scaling is requested (params->lk_safe_scaling, or
+        leafNum >= params->numseq_safe_scaling),
+      - the number of states of the alignment (4 and 20 have fixed-state
+        kernels, every other value uses the generic kernels).
+*/
+void PhyloTree::setLikelihoodKernelFMA() {
+    vector_size = 4;
+//    setParsimonyKernelAVX();
+
+// Kernels with a compile-time number of states; the variadic tail carries the
+// trailing boolean template flags (one flag for normal models, two for
+// site-specific models -- presumably FMA and site-model, confirm in phylokernelnew.h).
+#define FMA_KERNELS_FIXED(LH_MODE, NSTATES, ...) \
+    do { \
+        computeLikelihoodBranchPointer     = &PhyloTree::computeLikelihoodBranchSIMD    <Vec4d, LH_MODE, NSTATES, __VA_ARGS__>; \
+        computeLikelihoodDervPointer       = &PhyloTree::computeLikelihoodDervSIMD      <Vec4d, LH_MODE, NSTATES, __VA_ARGS__>; \
+        computePartialLikelihoodPointer    = &PhyloTree::computePartialLikelihoodSIMD   <Vec4d, LH_MODE, NSTATES, __VA_ARGS__>; \
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, LH_MODE, NSTATES, __VA_ARGS__>; \
+    } while (0)
+
+// Kernels that take the number of states at run time.
+#define FMA_KERNELS_GENERIC(LH_MODE, ...) \
+    do { \
+        computeLikelihoodBranchPointer     = &PhyloTree::computeLikelihoodBranchGenericSIMD    <Vec4d, LH_MODE, __VA_ARGS__>; \
+        computeLikelihoodDervPointer       = &PhyloTree::computeLikelihoodDervGenericSIMD      <Vec4d, LH_MODE, __VA_ARGS__>; \
+        computePartialLikelihoodPointer    = &PhyloTree::computePartialLikelihoodGenericSIMD   <Vec4d, LH_MODE, __VA_ARGS__>; \
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec4d, LH_MODE, __VA_ARGS__>; \
+    } while (0)
+
+// Dispatch on the number of states. Only 4 and 20 states get fixed-state
+// kernels here; 2- and 64-state alignments go through the generic kernels.
+#define FMA_KERNELS_BY_STATES(LH_MODE, ...) \
+    switch (aln->num_states) { \
+    case 4: \
+        FMA_KERNELS_FIXED(LH_MODE, 4, __VA_ARGS__); \
+        break; \
+    case 20: \
+        FMA_KERNELS_FIXED(LH_MODE, 20, __VA_ARGS__); \
+        break; \
+    default: \
+        FMA_KERNELS_GENERIC(LH_MODE, __VA_ARGS__); \
+        break; \
+    }
+
+    const bool site_specific = model_factory && model_factory->model->isSiteSpecificModel();
+    const bool safe_scaling = params->lk_safe_scaling || leafNum >= params->numseq_safe_scaling;
+
+    if (site_specific && safe_scaling) {
+        // safe site-specific model
+        FMA_KERNELS_BY_STATES(SAFE_LH, true, true);
+    } else if (site_specific) {
+        FMA_KERNELS_BY_STATES(NORM_LH, true, true);
+    } else if (safe_scaling) {
+        FMA_KERNELS_BY_STATES(SAFE_LH, true);
+    } else {
+        FMA_KERNELS_BY_STATES(NORM_LH, true);
+    }
+
+#undef FMA_KERNELS_BY_STATES
+#undef FMA_KERNELS_GENERIC
+#undef FMA_KERNELS_FIXED
+}
+
diff --git a/phylokernelmixrate.h b/phylokernelmixrate.h
index a92559f..bdabd64 100644
--- a/phylokernelmixrate.h
+++ b/phylokernelmixrate.h
@@ -23,7 +23,7 @@ template <class VectorClass, const int VCSIZE, const int nstates>
 void PhyloTree::computeMixratePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
     if (dad_branch->node->degree() > 3) {
         // TODO: SIMD version for multifurcating node
-        computeMixratePartialLikelihoodEigen(dad_branch, dad);
+        computePartialLikelihoodEigen(dad_branch, dad);
         return;
     }
 
diff --git a/phylokernelmixture.h b/phylokernelmixture.h
index 6b08498..992d4a8 100644
--- a/phylokernelmixture.h
+++ b/phylokernelmixture.h
@@ -21,7 +21,7 @@ template <class VectorClass, const int VCSIZE, const int nstates>
 void PhyloTree::computeMixturePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
     if (dad_branch->node->degree() > 3) {
         // TODO: SIMD version for multifurcating node
-        computeMixturePartialLikelihoodEigen(dad_branch, dad);
+        computePartialLikelihoodEigen(dad_branch, dad);
         return;
     }
 
diff --git a/phylokernelnew.h b/phylokernelnew.h
new file mode 100644
index 0000000..11c29eb
--- /dev/null
+++ b/phylokernelnew.h
@@ -0,0 +1,2802 @@
+/*
+ * phylokernelnew.h
+ * Newly revised kernel based on vectorizing over alignment patterns
+ *
+ *  Created on: Sept 23, 2016
+ *      Author: minh
+ */
+
+
+#if !defined(PHYLOKERNELNEW_H_) || !defined(PHYLOKERNELNEW_STATE_H_)
+
+#ifdef KERNEL_FIX_STATES
+#   define PHYLOKERNELNEW_STATE_H_
+#else
+#   define PHYLOKERNELNEW_H_
+#endif
+
+#include "phylotree.h"
+
+#include <vector>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+//#include <thread>
+
+using namespace std;
+
+/*******************************************************
+ *
+ * Helper function for vectors and matrix multiplication
+ *
+ ******************************************************/
+
+/**
+    sum of elements of a vector:
+    X = A[0] + ... + A[N-1]
+    template append = true to add the sum to the current value of X,
+    false to overwrite X with the sum
+    @param A vector of size N
+    @param[in,out] X sum of elements of A (plus the previous X if append)
+    @param N number of elements, must be at least 1
+*/
+#ifndef KERNEL_FIX_STATES
+template <class VectorClass, const bool append>
+inline void sumVec(VectorClass *A, VectorClass &X, size_t N)
+{
+    if (N == 1) {
+        // a single element must honour append like every other case
+        if (append)
+            X += A[0];
+        else
+            X = A[0];
+        return;
+    }
+
+    size_t i;
+    switch (N % 4) {
+    case 0: {
+        // four independent accumulators, combined pairwise at the end
+        VectorClass V[4];
+        V[0] = A[0];
+        V[1] = A[1];
+        V[2] = A[2];
+        V[3] = A[3];
+        for (i = 4; i < N; i+=4) {
+            V[0] += A[i];
+            V[1] += A[i+1];
+            V[2] += A[i+2];
+            V[3] += A[i+3];
+        }
+        if (append)
+            X += (V[0] + V[1]) + (V[2] + V[3]);
+        else
+            X = (V[0] + V[1]) + (V[2] + V[3]);
+        break;
+    }
+
+    case 2: {
+        VectorClass V[2];
+        V[0] = A[0];
+        V[1] = A[1];
+        for (i = 2; i < N; i+=2) {
+            V[0] += A[i];
+            V[1] += A[i+1];
+        }
+        if (append)
+            X += V[0] + V[1];
+        else
+            X = V[0] + V[1];
+        break;
+    }
+
+    default: {
+        VectorClass V[2];
+        // odd N >= 3: pair up the first N-1 elements, then add the last one
+        V[0] = A[0];
+        V[1] = A[1];
+        for (i = 2; i < N-1; i+=2) {
+            V[0] += A[i];
+            V[1] += A[i+1];
+        }
+        if (append)
+            X += A[N-1] + V[0] + V[1];
+        else
+            X = A[N-1] + V[0] + V[1];
+        break;
+    }
+    }
+}
+#endif
+
+/**
+    dot-product of a coefficient vector A and a vector B:
+    X = A.B = A[0]*B[0] + ... + A[N-1]*B[N-1]
+    The products are accumulated with mul_add() into four partial sums when N
+    is a multiple of 4 and into two partial sums otherwise; the template flag
+    FMA is not consulted inside this function.
+    @param N number of elements
+    @param A first vector of size N
+    @param B second vector of size N
+    @param[out] X dot-product of A and B
+*/
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, class Numeric, const size_t N, const bool FMA>
+inline void dotProductVec(Numeric *A, VectorClass *B, VectorClass &X)
+#else
+template <class VectorClass, class Numeric, const bool FMA>
+inline void dotProductVec(Numeric *A, VectorClass *B, VectorClass &X, size_t N)
+#endif
+{
+    if (N % 4 == 0) {
+        // four partial sums, reduced pairwise
+        VectorClass s0 = A[0] * B[0];
+        VectorClass s1 = A[1] * B[1];
+        VectorClass s2 = A[2] * B[2];
+        VectorClass s3 = A[3] * B[3];
+        for (size_t k = 4; k < N; k += 4) {
+            s0 = mul_add(A[k],   B[k],   s0);
+            s1 = mul_add(A[k+1], B[k+1], s1);
+            s2 = mul_add(A[k+2], B[k+2], s2);
+            s3 = mul_add(A[k+3], B[k+3], s3);
+        }
+        X = (s0+s1) + (s2+s3);
+        return;
+    }
+
+    // N is not a multiple of 4: two partial sums over all complete pairs
+    VectorClass s0 = A[0] * B[0];
+    VectorClass s1 = A[1] * B[1];
+    for (size_t k = 2; k+1 < N; k += 2) {
+        s0 = mul_add(A[k],   B[k],   s0);
+        s1 = mul_add(A[k+1], B[k+1], s1);
+    }
+    if (N % 2 == 0) {
+        X = (s0+s1);
+    } else {
+        // odd number of states: fold in the unpaired last element
+        X = mul_add(A[N-1], B[N-1], s0+s1);
+    }
+}
+
+/**
+    Dual dotProduct of four vectors A, B, C, D to compute X:
+    X = (A.B) * (C.D), where
+    A.B = A[0]*B[0] + ... + A[N-1]*B[N-1]
+    C.D = C[0]*D[0] + ... + C[N-1]*D[N-1]
+    Both dot-products use the same accumulation scheme (four partial sums for
+    N % 4 == 0, two otherwise) and the same pairwise final reduction.
+    The template flag FMA is not consulted inside this function; all
+    accumulation goes through mul_add().
+    @param N number of elements
+    @param A first vector of size N
+    @param B second vector of size N
+    @param C third vector of size N
+    @param D fourth vector of size N
+    @param[out] X = (A.B) * (C.D)
+*/
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, class Numeric, const size_t N, const bool FMA>
+inline void dotProductDualVec(Numeric *A, VectorClass *B, Numeric *C, VectorClass *D, VectorClass &X)
+#else
+template <class VectorClass, class Numeric, const bool FMA>
+inline void dotProductDualVec(Numeric *A, VectorClass *B, Numeric *C, VectorClass *D, VectorClass &X, size_t N)
+#endif
+{
+    size_t i, j;
+    switch (N % 4) {
+    case 0: {
+        VectorClass AB[4], CD[4];
+        for (j = 0; j < 4; j++) {
+            AB[j] = A[j] * B[j];
+            CD[j] = C[j] * D[j];
+        }
+        for (i = 4; i < N; i+=4) {
+            for (j = 0; j < 4; j++) {
+                AB[j] = mul_add(A[i+j], B[i+j], AB[j]);
+                CD[j] = mul_add(C[i+j], D[i+j], CD[j]);
+            }
+        }
+        // reduce both sides with the same pairwise tree
+        X = ((AB[0]+AB[1])+(AB[2]+AB[3])) * ((CD[0]+CD[1])+(CD[2]+CD[3]));
+        break;
+    }
+
+    case 2: {
+        VectorClass AB[2], CD[2];
+        for (j = 0; j < 2; j++) {
+            AB[j] = A[j] * B[j];
+            CD[j] = C[j] * D[j];
+        }
+        for (i = 2; i < N; i+=2) {
+            for (j = 0; j < 2; j++) {
+                AB[j] = mul_add(A[i+j], B[i+j], AB[j]);
+                CD[j] = mul_add(C[i+j], D[i+j], CD[j]);
+            }
+        }
+        X = (AB[0]+AB[1]) * (CD[0]+CD[1]);
+        break;
+    }
+
+    default: {
+        // odd states: pair up the first N-1 elements, then add the last one
+        VectorClass AB[2], CD[2];
+        for (j = 0; j < 2; j++) {
+            AB[j] = A[j] * B[j];
+            CD[j] = C[j] * D[j];
+        }
+        for (i = 2; i < N-1; i+=2) {
+            for (j = 0; j < 2; j++) {
+                AB[j] = mul_add(A[i+j], B[i+j], AB[j]);
+                CD[j] = mul_add(C[i+j], D[i+j], CD[j]);
+            }
+        }
+        AB[0] = mul_add(A[N-1], B[N-1], AB[0]+AB[1]);
+        CD[0] = mul_add(C[N-1], D[N-1], CD[0]+CD[1]);
+        X = AB[0] * CD[0];
+        break;
+    }
+    }
+}
+
+/**
+    product of a vector A and a matrix M, giving a vector X:
+    X[i] = A[0]*M[i,0] + ... + A[N-1]*M[i,N-1], for all i = 0,...,N-1
+    M is read row by row, N consecutive entries per row.
+    @param N number of elements
+    @param A input vector of size N
+    @param M input matrix of size N*N
+    @param[out] X output vector of size N
+*/
+// unrolled version, used for multiplying partial_lh with inv_eigenvector
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, class Numeric, const size_t N, const bool FMA>
+inline void productVecMat(VectorClass *A, Numeric *M, VectorClass *X)
+#else
+template <class VectorClass, class Numeric, const bool FMA>
+inline void productVecMat(VectorClass *A, Numeric *M, VectorClass *X, size_t N)
+#endif
+{
+    Numeric *row = M;
+
+    if (N % 4 == 0) {
+        for (size_t r = 0; r < N; r++, row += N) {
+            // four partial sums per row, reduced pairwise
+            VectorClass s0 = A[0] * row[0];
+            VectorClass s1 = A[1] * row[1];
+            VectorClass s2 = A[2] * row[2];
+            VectorClass s3 = A[3] * row[3];
+            for (size_t k = 4; k < N; k += 4) {
+                s0 = mul_add(A[k],   row[k],   s0);
+                s1 = mul_add(A[k+1], row[k+1], s1);
+                s2 = mul_add(A[k+2], row[k+2], s2);
+                s3 = mul_add(A[k+3], row[k+3], s3);
+            }
+            X[r] = (s0+s1)+(s2+s3);
+        }
+        return;
+    }
+
+    // N is not a multiple of 4: two partial sums per row
+    for (size_t r = 0; r < N; r++, row += N) {
+        VectorClass s0 = A[0] * row[0];
+        VectorClass s1 = A[1] * row[1];
+        for (size_t k = 2; k+1 < N; k += 2) {
+            s0 = mul_add(A[k],   row[k],   s0);
+            s1 = mul_add(A[k+1], row[k+1], s1);
+        }
+        if (N % 2 == 0) {
+            X[r] = (s0+s1);
+        } else {
+            // odd number of states: fold in the unpaired last column
+            X[r] = mul_add(A[N-1], row[N-1], s0+s1);
+        }
+    }
+}
+
+
+/**
+    product of a vector A and a matrix M, giving a vector X:
+    X[i] = A[0]*M[i,0] + ... + A[N-1]*M[i,N-1], for all i = 0,...,N-1
+    and update Xmax with the largest |X[i]|; Xmax is not reset here, so the
+    caller's incoming value takes part in the maximum.
+    @param N number of elements
+    @param A input vector of size N
+    @param M input matrix of size N*N
+    @param[out] X output vector of size N
+    @param[in,out] Xmax max of its previous value and all |X[i]|
+*/
+// unrolled version, used for multiplying partial_lh with inv_eigenvector
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, class Numeric, const size_t N, const bool FMA>
+inline void productVecMat(VectorClass *A, Numeric *M, VectorClass *X, VectorClass &Xmax)
+#else
+template <class VectorClass, class Numeric, const bool FMA>
+inline void productVecMat(VectorClass *A, Numeric *M, VectorClass *X, VectorClass &Xmax, size_t N)
+#endif
+{
+    Numeric *row = M;
+
+    if (N % 4 == 0) {
+        for (size_t r = 0; r < N; r++, row += N) {
+            // four partial sums per row, reduced pairwise
+            VectorClass s0 = A[0] * row[0];
+            VectorClass s1 = A[1] * row[1];
+            VectorClass s2 = A[2] * row[2];
+            VectorClass s3 = A[3] * row[3];
+            for (size_t k = 4; k < N; k += 4) {
+                s0 = mul_add(A[k],   row[k],   s0);
+                s1 = mul_add(A[k+1], row[k+1], s1);
+                s2 = mul_add(A[k+2], row[k+2], s2);
+                s3 = mul_add(A[k+3], row[k+3], s3);
+            }
+            X[r] = (s0+s1)+(s2+s3);
+            Xmax = max(Xmax, abs(X[r]));
+        }
+        return;
+    }
+
+    // N is not a multiple of 4: two partial sums per row
+    for (size_t r = 0; r < N; r++, row += N) {
+        VectorClass s0 = A[0] * row[0];
+        VectorClass s1 = A[1] * row[1];
+        for (size_t k = 2; k+1 < N; k += 2) {
+            s0 = mul_add(A[k],   row[k],   s0);
+            s1 = mul_add(A[k+1], row[k+1], s1);
+        }
+        if (N % 2 == 0) {
+            X[r] = (s0+s1);
+        } else {
+            // odd number of states: fold in the unpaired last column
+            X[r] = mul_add(A[N-1], row[N-1], s0+s1);
+        }
+        Xmax = max(Xmax, abs(X[r]));
+    }
+}
+
+/**
+    compute dot-products of 3 vectors A, B, C with a single vector D and returns X, Y, Z:
+    X =   A.D = A[0]*D[0] + ... + A[N-1]*D[N-1]
+    Y =   B.D = B[0]*D[0] + ... + B[N-1]*D[N-1]
+    Z =   C.D = C[0]*D[0] + ... + C[N-1]*D[N-1]
+    The handling of the last element depends on the parity of N, the length
+    that is actually iterated over. (Deciding it from nstates counts the last
+    term twice when nstates is odd but N is even, and reads past the end when
+    nstates is even but N is odd.)
+    @param N number of elements, must be at least 2
+    @param nstates number of states; kept for interface compatibility, not used
+    @param A vector of size N
+    @param B vector of size N
+    @param C vector of size N
+    @param D vector of size N
+    @param[out] X = A.D
+    @param[out] Y = B.D
+    @param[out] Z = C.D
+*/
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, class Numeric, const size_t nstates, const bool FMA>
+inline void dotProductTriple(Numeric *A, Numeric *B, Numeric *C, VectorClass *D,
+    VectorClass &X, VectorClass &Y, VectorClass &Z, size_t N)
+#else
+template <class VectorClass, class Numeric, const bool FMA>
+inline void dotProductTriple(Numeric *A, Numeric *B, Numeric *C, VectorClass *D,
+    VectorClass &X, VectorClass &Y, VectorClass &Z, size_t N, size_t nstates)
+#endif
+{
+    (void)nstates;
+
+    size_t i, j;
+    VectorClass AD[2], BD[2], CD[2];
+    for (j = 0; j < 2; j++) {
+        AD[j] = A[j] * D[j];
+        BD[j] = B[j] * D[j];
+        CD[j] = C[j] * D[j];
+    }
+    // accumulate all complete pairs (i, i+1) after the first one
+    for (i = 2; i+1 < N; i+=2) {
+        for (j = 0; j < 2; j++) {
+            AD[j] = mul_add(A[i+j], D[i+j], AD[j]);
+            BD[j] = mul_add(B[i+j], D[i+j], BD[j]);
+            CD[j] = mul_add(C[i+j], D[i+j], CD[j]);
+        }
+    }
+    if (N % 2 == 0) {
+        X  = AD[0] + AD[1];
+        Y  = BD[0] + BD[1];
+        Z  = CD[0] + CD[1];
+    } else {
+        // odd N: the last element has no partner, fold it in separately
+        X  = mul_add(A[N-1], D[N-1], AD[0] + AD[1]);
+        Y  = mul_add(B[N-1], D[N-1], BD[0] + BD[1]);
+        Z  = mul_add(C[N-1], D[N-1], CD[0] + CD[1]);
+    }
+}
+
+
+/**
+    Element-wise triple product of three vectors A, B, C, summed up:
+    X = A.B.C = A[0]*B[0]*C[0] + ... + A[N-1]*B[N-1]*C[N-1]
+    @param N number of elements
+    @param A vector of size N
+    @param B vector of size N
+    @param C vector of size N
+    @param[out] X = A.B.C
+*/
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, class Numeric, const size_t N, const bool FMA>
+inline void dotProduct3Vec(Numeric *A, VectorClass *B, VectorClass *C, VectorClass &X)
+#else
+template <class VectorClass, class Numeric, const bool FMA>
+inline void dotProduct3Vec(Numeric *A, VectorClass *B, VectorClass *C, VectorClass &X, size_t N)
+#endif
+{
+    if (N % 4 == 0) {
+        // four partial sums, reduced pairwise
+        VectorClass s0 = A[0] * B[0] * C[0];
+        VectorClass s1 = A[1] * B[1] * C[1];
+        VectorClass s2 = A[2] * B[2] * C[2];
+        VectorClass s3 = A[3] * B[3] * C[3];
+        for (size_t k = 4; k < N; k += 4) {
+            s0 = mul_add(A[k]*B[k],     C[k],   s0);
+            s1 = mul_add(A[k+1]*B[k+1], C[k+1], s1);
+            s2 = mul_add(A[k+2]*B[k+2], C[k+2], s2);
+            s3 = mul_add(A[k+3]*B[k+3], C[k+3], s3);
+        }
+        X = (s0+s1)+(s2+s3);
+        return;
+    }
+
+    // N is not a multiple of 4: two partial sums over all complete pairs
+    VectorClass s0 = A[0] * B[0] * C[0];
+    VectorClass s1 = A[1] * B[1] * C[1];
+    for (size_t k = 2; k+1 < N; k += 2) {
+        s0 = mul_add(A[k]*B[k],     C[k],   s0);
+        s1 = mul_add(A[k+1]*B[k+1], C[k+1], s1);
+    }
+    if (N % 2 == 0) {
+        X = (s0+s1);
+    } else {
+        // odd states: fold in the unpaired last element
+        X = mul_add(A[N-1]*B[N-1], C[N-1], s0+s1);
+    }
+}
+
+
+/**
+    Exponentially weighted sum of element-wise products:
+    X = exp(A[0]*D)*B[0]*C[0] + ... + exp(A[N-1]*D)*B[N-1]*C[N-1]
+    @param N number of elements
+    @param A vector of size N
+    @param B vector of size N
+    @param C vector of size N
+    @param D coefficient for A
+    @param[out] X the sum above
+*/
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, class Numeric, const size_t N, const bool FMA>
+inline void dotProductExp(VectorClass *A, VectorClass *B, VectorClass *C, Numeric D, VectorClass &X)
+#else
+template <class VectorClass, class Numeric, const bool FMA>
+inline void dotProductExp(VectorClass *A, VectorClass *B, VectorClass *C, Numeric D, VectorClass &X, size_t N)
+#endif
+{
+    // the first term initialises X, the remaining ones are accumulated into it
+    X = exp(A[0]*D)*B[0]*C[0];
+    size_t k = 1;
+    while (k < N) {
+        X = mul_add(exp(A[k]*D), B[k]*C[k], X);
+        ++k;
+    }
+}
+
+
+/**
+    Exponentially weighted sum of a vector:
+    X = exp(A[0]*D)*B[0] + ... + exp(A[N-1]*D)*B[N-1]
+    @param N number of elements
+    @param A vector of size N
+    @param B vector of size N
+    @param D coefficient for A
+    @param[out] X the sum above
+*/
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, class Numeric, const size_t N, const bool FMA>
+inline void dotProductExp(VectorClass *A, VectorClass *B, Numeric D, VectorClass &X)
+#else
+template <class VectorClass, class Numeric, const bool FMA>
+inline void dotProductExp(VectorClass *A, VectorClass *B, Numeric D, VectorClass &X, size_t N)
+#endif
+{
+    // the first term initialises X, the remaining ones are accumulated into it
+    X = exp(A[0]*D)*B[0];
+    size_t k = 1;
+    while (k < N) {
+        X = mul_add(exp(A[k]*D), B[k], X);
+        ++k;
+    }
+}
+
+/**
+    Rescale the partial likelihoods of the vector lanes that have underflown.
+    A lane counts as underflown when its lh_max is below SCALING_THRESHOLD,
+    is not exactly 0 and its invar entry is 0. For every such lane the
+    partial likelihood entries are multiplied by SCALING_THRESHOLD_INVER and
+    the scaling counter is incremented:
+      - SAFE_NUMERIC: nstates entries are scaled and the counter at
+        dad_scale_num[lane*ncat_mix] is incremented;
+      - otherwise: ncat_mix*nstates entries are scaled and the counter at
+        dad_scale_num[lane] is incremented.
+    Consecutive entries of one lane are VectorClass::size() doubles apart.
+    @param lh_max per-lane maximum likelihood value used for the test
+    @param invar per-lane values loaded with an aligned load; only lanes with 0 are scaled
+    @param dad_partial_lh partial likelihood entries, modified in place
+    @param dad_scale_num scaling counters, modified in place
+    @param ncat_mix factor used for the counter stride / block size as described above
+    @param nstates number of states
+*/
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, const bool SAFE_NUMERIC, const size_t nstates>
+inline void scaleLikelihood(VectorClass &lh_max, double *invar, double *dad_partial_lh, UBYTE *dad_scale_num,
+    size_t ncat_mix)
+#else
+template <class VectorClass, const bool SAFE_NUMERIC>
+inline void scaleLikelihood(VectorClass &lh_max, double *invar, double *dad_partial_lh, UBYTE *dad_scale_num,
+    size_t ncat_mix, size_t nstates)
+#endif
+{
+    auto underflown = (lh_max < SCALING_THRESHOLD) & (lh_max != 0.0) & (VectorClass().load_a(invar) == 0.0);
+    if (!horizontal_or(underflown))
+        return; // no site has numerical underflow
+
+    // number of entries to rescale per lane and stride of the scaling counters
+    const size_t num_entries = SAFE_NUMERIC ? nstates : ncat_mix * nstates;
+    const size_t scale_stride = SAFE_NUMERIC ? ncat_mix : 1;
+
+    for (size_t lane = 0; lane < VectorClass::size(); lane++) {
+        if (!underflown[lane])
+            continue;
+        // BQM 2016-05-03: only scale for non-constant sites
+        double *lane_lh = &dad_partial_lh[lane];
+        for (size_t k = 0; k < num_entries; k++)
+            lane_lh[k*VectorClass::size()] *= SCALING_THRESHOLD_INVER;
+        dad_scale_num[lane*scale_stride] += 1;
+    }
+}
+
+
+/*******************************************************
+ *
+ * Helper function to pre-compute traversal information
+ * and buffer to transition matrix
+ *
+ ******************************************************/
+
+#ifndef KERNEL_FIX_STATES
+/**
+    Recursively collect the branches whose partial likelihood still has to be
+    computed, in the order in which they must be processed.
+    Nothing is collected for a branch whose partial_lh_computed bit 0 is set
+    or whose node is a leaf. Otherwise the neighbors of the node are visited
+    first (largest subtree size first), the branch is marked as computed,
+    a memory slot is allocated or refreshed for it, and a TraversalInfo entry
+    is appended to traversal_info. For non-site-specific models the entry
+    receives sub-ranges of buffer for echildren and, if the node has leaf
+    children, partial_lh_leaves.
+    @param dad_branch branch leading from dad to the node to process
+    @param dad node from which the branch is seen
+    @param[in,out] buffer next free position of the transition-matrix buffer;
+        advanced past the ranges handed out to this branch and its subtree
+    @return the result of mem_slots.lock(dad_branch)
+*/
+inline bool PhyloTree::computeTraversalInfo(PhyloNeighbor *dad_branch, PhyloNode *dad, double* &buffer) {
+
+    size_t nstates = aln->num_states;
+    PhyloNode *node = (PhyloNode*)dad_branch->node;
+
+    if ((dad_branch->partial_lh_computed & 1) || node->isLeaf()) {
+        return mem_slots.lock(dad_branch);
+    }
+
+    size_t num_leaves = 0;
+    // lock result of every child branch, indexed like neivec below;
+    // a std::vector is used because variable-length arrays are not standard C++
+    vector<bool> locked((size_t)node->degree(), false);
+
+    // sort neighbor in descending size order
+    NeighborVec neivec = node->neighbors;
+    NeighborVec::iterator it, i2;
+    for (it = neivec.begin(); it != neivec.end(); it++)
+        for (i2 = it+1; i2 != neivec.end(); i2++)
+            if (((PhyloNeighbor*)*it)->size < ((PhyloNeighbor*)*i2)->size) {
+                Neighbor *nei = *it;
+                *it = *i2;
+                *i2 = nei;
+            }
+
+    // recursive
+    for (it = neivec.begin(); it != neivec.end(); it++)
+        if ((*it)->node != dad) {
+            locked[it - neivec.begin()] = computeTraversalInfo((PhyloNeighbor*)(*it), node, buffer);
+            if ((*it)->node->isLeaf())
+                num_leaves++;
+        }
+    dad_branch->partial_lh_computed |= 1;
+
+    // prepare information for this branch
+    TraversalInfo info(dad_branch, dad);
+    info.echildren = info.partial_lh_leaves = NULL;
+
+    // re-orient partial_lh
+    reorientPartialLh(dad_branch, dad);
+
+    if (!dad_branch->partial_lh || mem_slots.locked(dad_branch)) {
+        // still no free entry found, memory saving technique
+        int slot_id = mem_slots.allocate(dad_branch);
+        if (slot_id < 0) {
+            // diagnostic dump of the traversal collected so far, then abort via assert
+            cout << "traversal order:";
+            for (auto ti = traversal_info.begin(); ti != traversal_info.end(); ti++) {
+                ti->dad_branch->node->name = convertIntToString(ti->dad_branch->size);
+                cout << "  ";
+                if (ti->dad->isLeaf())
+                    cout << ti->dad->name;
+                else
+                    cout << ti->dad->id;
+                cout << "->";
+                if (ti->dad_branch->node->isLeaf())
+                    cout << ti->dad_branch->node->name;
+                else
+                    cout << ti->dad_branch->node->id;
+                if (params->lh_mem_save == LM_MEM_SAVE) {
+                    if (ti->dad_branch->partial_lh_computed)
+                        cout << " [";
+                    else
+                        cout << " (";
+                    cout << mem_slots.findNei(ti->dad_branch) - mem_slots.begin();
+                    if (ti->dad_branch->partial_lh_computed)
+                        cout << "]";
+                    else
+                        cout << ")";
+                }
+            }
+            cout << endl;
+            drawTree(cout);
+            assert(0 && "No free/unlocked mem slot found!");
+        }
+    } else {
+        mem_slots.update(dad_branch);
+    }
+
+    // runs for both cases above (it is not part of the else branch)
+    if (verbose_mode >= VB_MED && params->lh_mem_save == LM_MEM_SAVE) {
+        int slot_id = mem_slots.findNei(dad_branch) - mem_slots.begin();
+        node->name = convertIntToString(slot_id);
+        cout << "Branch " << dad->id << "-" << node->id << " assigned slot " << slot_id << endl;
+    }
+
+    if (params->lh_mem_save == LM_MEM_SAVE) {
+        // release the internal child branches that were locked during the recursion
+        for (it = neivec.begin(); it != neivec.end(); it++)
+            if ((*it)->node != dad) {
+                if (!(*it)->node->isLeaf() && locked[it-neivec.begin()])
+                    mem_slots.unlock((PhyloNeighbor*)*it);
+            }
+    }
+
+    if (!model->isSiteSpecificModel()) {
+        //------- normal model -----
+        info.echildren = buffer;
+        size_t block = nstates * ((model_factory->fused_mix_rate) ? site_rate->getNRate() : site_rate->getNRate()*model->getNMixtures());
+        buffer += get_safe_upper_limit(block*nstates*(node->degree()-1));
+        if (num_leaves) {
+            info.partial_lh_leaves = buffer;
+            buffer += get_safe_upper_limit((aln->STATE_UNKNOWN+1)*block*num_leaves);
+        }
+    }
+
+    traversal_info.push_back(info);
+    return mem_slots.lock(dad_branch);
+}
+#endif
+
+
+
+#ifdef KERNEL_FIX_STATES
+template<class VectorClass, const int nstates>
+#else
+template<class VectorClass>
+#endif
+void PhyloTree::computePartialInfo(TraversalInfo &info, VectorClass* buffer) {
+    /*
+     * Fill the precomputed tables of one traversal entry; the partial
+     * likelihood kernel later reads them back through info.echildren and
+     * info.partial_lh_leaves.
+     *
+     * For every neighbor `child` of info.dad_branch->node other than info.dad
+     * and every category index c in [0, ncat_mix), an nstates x nstates table
+     * is written to info.echildren whose entry (x, i) is
+     *     evec[x*nstates+i] * exp(eval[i] * getRate(c % ncat) * child->length)
+     * with eval/evec offset to the mixture class that c maps to.
+     *
+     * For every leaf child, info.partial_lh_leaves additionally receives, for
+     * each state listed in aln->seq_states[leaf id], the dot product of each
+     * table row with that state's tip_partial_lh vector; the slot of
+     * aln->STATE_UNKNOWN is set to 1.0 throughout.
+     *
+     * @param info   traversal entry; dad, dad_branch, echildren and
+     *               partial_lh_leaves are read, and the memory the last two
+     *               point to is overwritten
+     * @param buffer scratch space; used only by the vectorized branch, which
+     *               stores nstates/VectorClass::size() vectors in it
+     */
+#ifndef KERNEL_FIX_STATES
+    size_t nstates = aln->num_states;
+#endif
+
+    size_t c, i, x;
+    size_t ncat = site_rate->getNRate();
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures(); // number of (rate, mixture) combinations; just ncat when rates are fused with mixtures
+    size_t block = nstates * ncat_mix; // doubles per leaf state in partial_lh_leaves
+    size_t tip_block = nstates * model->getNMixtures(); // doubles per state in tip_partial_lh
+    size_t mix_addr_nstates[ncat_mix], mix_addr[ncat_mix]; // runtime-sized arrays (not standard C++; compiler extension)
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+    for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom; // mixture class used by category index c
+        mix_addr_nstates[c] = m*nstates; // offset of class m in eval (and within one state of tip_partial_lh)
+        mix_addr[c] = mix_addr_nstates[c]*nstates; // offset of class m in evec
+    }
+	double *evec = model->getEigenvectors();
+	double *eval = model->getEigenvalues();
+
+    PhyloNode *dad = info.dad, *node = (PhyloNode*)info.dad_branch->node;
+    double *echild = info.echildren; // write cursor, advanced by one child's tables per neighbor
+    double *partial_lh_leaf = info.partial_lh_leaves; // write cursor, advanced only for leaf children
+
+    if (nstates % VectorClass::size() == 0) {
+        // vectorized version
+        VectorClass *expchild = (VectorClass*)buffer; // scratch for exp(eval * rate * length)
+        FOR_NEIGHBOR_IT(node, dad, it) {
+            PhyloNeighbor *child = (PhyloNeighbor*)*it;
+            VectorClass *echild_ptr = (VectorClass*)echild;
+            // precompute information buffer
+            for (c = 0; c < ncat_mix; c++) {
+                VectorClass len_child = site_rate->getRate(c%ncat) * child->length; // rate-scaled branch length broadcast to all lanes
+                double *eval_ptr = eval + mix_addr_nstates[c];
+                double *evec_ptr = evec + mix_addr[c];
+                for (i = 0; i < nstates/VectorClass::size(); i++) {
+                    // eval is not aligned!
+                    // NOTE(review): the remark above disagrees with the load_a call below; if VectorClass follows
+                    // Agner Fog's vector class library, load_a is the aligned load - confirm which one is right
+                    expchild[i] = exp(VectorClass().load_a(&eval_ptr[i*VectorClass::size()]) * len_child);
+                }
+                for (x = 0; x < nstates; x++) {
+                    for (i = 0; i < nstates/VectorClass::size(); i++) {
+                        // evec is not aligned! (NOTE(review): same load_a discrepancy as for eval above)
+                        echild_ptr[i] = (VectorClass().load_a(&evec_ptr[x*nstates+i*VectorClass::size()]) * expchild[i]);
+                    }
+                    echild_ptr += nstates/VectorClass::size(); // next row of the table
+                }
+            }
+            // pre compute information for tip
+            if (child->node->isLeaf()) {
+                vector<int>::iterator it; // shadows the neighbor iterator `it` of the enclosing loop
+
+                for (it = aln->seq_states[child->node->id].begin(); it != aln->seq_states[child->node->id].end(); it++) {
+                    int state = (*it);
+                    double *this_partial_lh_leaf = partial_lh_leaf + state*block;
+                    VectorClass *echild_ptr = (VectorClass*)echild; // restart at this child's first table
+                    for (c = 0; c < ncat_mix; c++) {
+                        VectorClass *this_tip_partial_lh = (VectorClass*)(tip_partial_lh + state*tip_block + mix_addr_nstates[c]);
+                        for (x = 0; x < nstates; x++) {
+                            // dot product of table row x with the tip vector of this state
+                            VectorClass vchild = echild_ptr[0] * this_tip_partial_lh[0];
+                            for (i = 1; i < nstates/VectorClass::size(); i++) {
+                                vchild = mul_add(echild_ptr[i], this_tip_partial_lh[i], vchild);
+                            }
+                            this_partial_lh_leaf[x] = horizontal_add(vchild); // presumably the sum over all lanes
+                            echild_ptr += nstates/VectorClass::size();
+                        }
+                        this_partial_lh_leaf += nstates;
+                    }
+                }
+                size_t addr = aln->STATE_UNKNOWN * block;
+                for (x = 0; x < block; x++) {
+                    partial_lh_leaf[addr+x] = 1.0; // unknown state: every entry is 1
+                }
+                partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block; // skip the STATE_UNKNOWN+1 slots of this leaf
+            }
+            echild += block*nstates; // skip the ncat_mix tables of this child
+        }
+//        aligned_free(expchild);
+    } else {
+        // non-vectorized version (same computation with plain doubles; `buffer` is not used here)
+        double expchild[nstates];
+        FOR_NEIGHBOR_IT(node, dad, it) {
+            PhyloNeighbor *child = (PhyloNeighbor*)*it;
+            // precompute information buffer
+            double *echild_ptr = echild;
+            for (c = 0; c < ncat_mix; c++) {
+                double len_child = site_rate->getRate(c%ncat) * child->length; // rate-scaled branch length
+                double *eval_ptr = eval + mix_addr_nstates[c];
+                double *evec_ptr = evec + mix_addr[c];
+                for (i = 0; i < nstates; i++) {
+                    expchild[i] = exp(eval_ptr[i]*len_child);
+                }
+                for (x = 0; x < nstates; x++) {
+                    for (i = 0; i < nstates; i++) {
+                        echild_ptr[i] = evec_ptr[x*nstates+i] * expchild[i];
+                    }
+                    echild_ptr += nstates; // next row of the table
+                }
+            }
+            // pre compute information for tip
+            if (child->node->isLeaf()) {
+                vector<int>::iterator it; // shadows the neighbor iterator `it` of the enclosing loop
+                for (it = aln->seq_states[child->node->id].begin(); it != aln->seq_states[child->node->id].end(); it++) {
+                    int state = (*it);
+                    double *this_partial_lh_leaf = partial_lh_leaf + state*block;
+                    double *echild_ptr = echild; // restart at this child's first table
+                    for (c = 0; c < ncat_mix; c++) {
+                        double *this_tip_partial_lh = tip_partial_lh + state*tip_block + mix_addr_nstates[c];
+                        for (x = 0; x < nstates; x++) {
+                            // dot product of table row x with the tip vector of this state
+                            double vchild = echild_ptr[0] * this_tip_partial_lh[0];
+                            for (i = 1; i < nstates; i++) {
+                                vchild += echild_ptr[i] * this_tip_partial_lh[i];
+                            }
+                            this_partial_lh_leaf[x] = vchild;
+                            echild_ptr += nstates;
+                        }
+                        this_partial_lh_leaf += nstates;
+                    }
+                }
+                size_t addr = aln->STATE_UNKNOWN * block;
+                for (x = 0; x < block; x++) {
+                    partial_lh_leaf[addr+x] = 1.0; // unknown state: every entry is 1
+                }
+                partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block; // skip the STATE_UNKNOWN+1 slots of this leaf
+            }
+            echild += block*nstates; // skip the ncat_mix tables of this child
+        }
+    }
+
+}
+
+#ifndef KERNEL_FIX_STATES
+/**
+ * Split a range of patterns into one contiguous sub-range per thread.
+ *
+ * The total is first rounded up to a multiple of VectorClass::size(), and
+ * every boundary that is produced is a multiple of VectorClass::size() too.
+ * Thread t is meant to work on [limits[t], limits[t+1]).
+ *
+ * When there are fewer vector-sized chunks than threads, the surplus threads
+ * receive the empty range [elements, elements), so limits always gains
+ * exactly threads+1 entries and callers may index limits[thread_id+1] for
+ * every thread_id in [0, threads).
+ *
+ * @param threads  number of sub-ranges to produce; must be positive
+ * @param elements number of patterns to distribute
+ * @param limits   output; boundaries are appended, so it is expected to be
+ *                 empty on entry
+ */
+template<class VectorClass>
+inline void computeBounds(int threads, size_t elements, vector<size_t> &limits) {
+    limits.reserve(threads+1);
+    elements = ((elements+VectorClass::size()-1)/VectorClass::size())*VectorClass::size();
+    size_t rest_elem = elements;
+    limits.push_back(0);
+    size_t last = 0;
+    for (int rest_thread = threads; rest_thread > 1; rest_thread--) {
+        // even share (rounded up) of what is still unassigned
+        size_t block_size = rest_elem/rest_thread;
+        if (rest_elem % rest_thread != 0) block_size++;
+        // padding to the vector size
+        block_size = ((block_size+VectorClass::size()-1)/VectorClass::size())*VectorClass::size();
+
+        last += block_size;
+        if (last >= elements)
+            break; // everything has been handed out already
+        limits.push_back(last);
+        rest_elem -= block_size;
+    }
+
+    limits.push_back(elements);
+
+    // The loop above stops early when the work runs out before the threads do;
+    // give every remaining thread an empty range instead of leaving limits short.
+    if (threads > 0) {
+        while (limits.size() < (size_t)threads + 1)
+            limits.push_back(elements);
+    }
+    assert(threads > 0 && limits.size() == (size_t)threads + 1);
+}
+#endif
+
+#ifdef KERNEL_FIX_STATES
+template<class VectorClass, const int nstates>
+#else
+template<class VectorClass>
+#endif
+void PhyloTree::computeTraversalInfo(PhyloNode *node, PhyloNode *dad, bool compute_partial_lh) {
+    /*
+     * Build traversal_info for the branch between node and dad and, on
+     * request, evaluate it.
+     *
+     * Steps, in order:
+     *  1. compute the tip partial likelihoods if tip_partial_lh_computed is
+     *     not set, and empty traversal_info;
+     *  2. when params->lh_mem_save == LM_MEM_SAVE, swap node and dad if
+     *     node->computeSize(dad) < dad->computeSize(node);
+     *  3. call the PhyloNeighbor overload of computeTraversalInfo for both
+     *     directions of the branch and, in LM_MEM_SAVE mode, unlock the
+     *     memory slots for which those calls returned true;
+     *  4. unless model->isSiteSpecificModel(), run computePartialInfo on
+     *     every collected entry (inside an OpenMP parallel region that is
+     *     active only when there are at least 3 entries);
+     *  5. if compute_partial_lh is true, split the patterns into one range
+     *     per thread with computeBounds, let each thread call
+     *     computePartialLikelihood on every entry for its range, and empty
+     *     traversal_info again.
+     *
+     * @param node               one end of the branch
+     * @param dad                the other end; findNeighbor is called on each
+     *                           node with the other one as argument
+     * @param compute_partial_lh when false, traversal_info is left filled
+     *                           after step 4
+     */
+    if (!tip_partial_lh_computed)
+        computeTipPartialLikelihood();
+
+    traversal_info.clear();
+
+    // reserve beginning of buffer_partial_lh for other purpose
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? site_rate->getNRate() : site_rate->getNRate()*model->getNMixtures();
+    size_t block = aln->num_states * ncat_mix;
+    double *buffer = buffer_partial_lh + block*VectorClass::size()*num_threads + get_safe_upper_limit(block)*(aln->STATE_UNKNOWN+2); // first double after the reserved head
+
+    // sort subtrees for mem save technique
+    if (params->lh_mem_save == LM_MEM_SAVE) {
+//        sortNeighborBySubtreeSize(node, dad);
+//        sortNeighborBySubtreeSize(dad, node);
+        int node_size = node->computeSize(dad);
+        int dad_size = dad->computeSize(node);
+//        PhyloNeighbor *dad_branch = (PhyloNeighbor*)dad->findNeighbor(node);
+//        PhyloNeighbor *node_branch = (PhyloNeighbor*)node->findNeighbor(dad);
+        if (node_size < dad_size) {
+            // swap node and dad due to tree size
+            PhyloNode *tmp = node;
+            node = dad;
+            dad = tmp;
+        }
+
+    }
+
+    PhyloNeighbor *dad_branch = (PhyloNeighbor*)dad->findNeighbor(node);
+    PhyloNeighbor *node_branch = (PhyloNeighbor*)node->findNeighbor(dad);
+    bool dad_locked = computeTraversalInfo(dad_branch, dad, buffer); // entries for the subtree on node's side are appended first
+    bool node_locked = computeTraversalInfo(node_branch, node, buffer); // then those for the subtree on dad's side
+    if (params->lh_mem_save == LM_MEM_SAVE) {
+        if (dad_locked)
+            mem_slots.unlock(dad_branch);
+        if (node_locked)
+            mem_slots.unlock(node_branch);
+    }
+
+    if (verbose_mode >= VB_DEBUG && traversal_info.size() > 0) {
+        // draw the tree with dad as temporary root, then restore the real root
+        Node *saved = root;
+        root = dad;
+        drawTree(cout);
+        root = saved;
+    }
+
+    if (traversal_info.empty())
+        return; // nothing was collected, so there is nothing to precompute or evaluate
+
+    if (!model->isSiteSpecificModel()) {
+
+        int num_info = traversal_info.size();
+
+        if (verbose_mode >= VB_DEBUG) {
+            // print each entry as "dad->node", using names for leaves and ids otherwise
+            cout << "traversal order:";
+            for (auto it = traversal_info.begin(); it != traversal_info.end(); it++) {
+                cout << "  ";
+                if (it->dad->isLeaf())
+                    cout << it->dad->name;
+                else
+                    cout << it->dad->id;
+                cout << "->";
+                if (it->dad_branch->node->isLeaf())
+                    cout << it->dad_branch->node->name;
+                else
+                    cout << it->dad_branch->node->id;
+                if (params->lh_mem_save == LM_MEM_SAVE) {
+                    // memory slot index, in [] when partial_lh_computed is non-zero and in () otherwise
+                    if (it->dad_branch->partial_lh_computed)
+                        cout << " [";
+                    else
+                        cout << " (";
+                    cout << mem_slots.findNei(it->dad_branch) - mem_slots.begin();
+                    if (it->dad_branch->partial_lh_computed)
+                        cout << "]";
+                    else
+                        cout << ")";
+                }
+            }
+            cout << endl;
+        }
+
+#ifdef _OPENMP
+#pragma omp parallel if (num_info >= 3) num_threads(num_threads)
+        {
+            VectorClass *buffer_tmp = (VectorClass*)buffer + aln->num_states*omp_get_thread_num(); // per-thread scratch, aln->num_states vectors apart
+#pragma omp for schedule(static)
+#else
+            VectorClass *buffer_tmp = (VectorClass*)buffer; // NOTE(review): scratch starts at `buffer`; presumably the calls above advanced it past the tables they handed out (parameter taken by reference?) - confirm
+#endif
+            for (int i = 0; i < num_info; i++) {
+            #ifdef KERNEL_FIX_STATES
+                computePartialInfo<VectorClass, nstates>(traversal_info[i], buffer_tmp);
+            #else
+                computePartialInfo<VectorClass>(traversal_info[i], buffer_tmp);
+            #endif
+            }
+#ifdef _OPENMP
+        }
+#endif
+    }
+
+    if (compute_partial_lh) {
+        vector<size_t> limits;
+        size_t orig_nptn = ((aln->size()+VectorClass::size()-1)/VectorClass::size())*VectorClass::size(); // alignment patterns, rounded up to the vector size
+        size_t nptn = ((orig_nptn+model_factory->unobserved_ptns.size()+VectorClass::size()-1)/VectorClass::size())*VectorClass::size(); // plus unobserved patterns, rounded up again
+        computeBounds<VectorClass>(num_threads, nptn, limits); // the indexing below relies on limits holding num_threads+1 entries
+
+        #ifdef _OPENMP
+        #pragma omp parallel for schedule(static, 1) num_threads(num_threads)
+        #endif
+        for (int thread_id = 0; thread_id < num_threads; thread_id++) {
+            // each thread walks all entries in traversal order, restricted to its own pattern range
+            for (vector<TraversalInfo>::iterator it = traversal_info.begin(); it != traversal_info.end(); it++)
+                computePartialLikelihood(*it, limits[thread_id], limits[thread_id+1], thread_id);
+        }
+        traversal_info.clear();
+    }
+    return;
+}
+
+/*******************************************************
+ *
+ * NEW! highly-vectorized partial likelihood function
+ *
+ ******************************************************/
+
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA, const bool SITE_MODEL>
+void PhyloTree::computePartialLikelihoodSIMD(TraversalInfo &info, size_t ptn_lower, size_t ptn_upper, int thread_id)
+#else
+template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA, const bool SITE_MODEL>
+void PhyloTree::computePartialLikelihoodGenericSIMD(TraversalInfo &info, size_t ptn_lower, size_t ptn_upper, int thread_id)
+#endif
+{
+
+    PhyloNeighbor *dad_branch = info.dad_branch;
+    PhyloNode *dad = info.dad;
+    // don't recompute the likelihood
+	assert(dad);
+//    if (dad_branch->partial_lh_computed & 1)
+//        return;
+//    dad_branch->partial_lh_computed |= 1;
+    PhyloNode *node = (PhyloNode*)(dad_branch->node);
+
+
+#ifndef KERNEL_FIX_STATES
+    size_t nstates = aln->num_states;
+#endif
+    const size_t states_square = nstates*nstates;
+    size_t orig_nptn = aln->size();
+    size_t max_orig_nptn = ((orig_nptn+VectorClass::size()-1)/VectorClass::size())*VectorClass::size();
+    size_t nptn = max_orig_nptn+model_factory->unobserved_ptns.size();
+//    size_t max_nptn = ((nptn+VectorClass::size()-1)/VectorClass::size())*VectorClass::size();
+
+//    if (!tip_partial_lh_computed)
+//        computeTipPartialLikelihood();
+
+	if (node->isLeaf()) {
+//	    dad_branch->lh_scale_factor = 0.0;
+		return;
+	}
+    
+    size_t ptn, c;
+    size_t ncat = site_rate->getNRate();
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
+    size_t mix_addr_nstates[ncat_mix], mix_addr[ncat_mix];
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+    for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom;
+        mix_addr_nstates[c] = m*nstates;
+        mix_addr[c] = mix_addr_nstates[c]*nstates;
+    }
+    size_t i, x;
+    size_t block = nstates * ncat_mix;
+//    size_t tip_block = nstates * model->getNMixtures();
+    size_t tip_mem_size = max_orig_nptn * nstates;
+//    size_t scale_size = SAFE_NUMERIC ? max_nptn * ncat_mix : max_nptn;
+    size_t scale_size = SAFE_NUMERIC ? (ptn_upper-ptn_lower) * ncat_mix : (ptn_upper-ptn_lower);
+
+	double *evec = model->getEigenvectors();
+	double *inv_evec = model->getInverseEigenvectors();
+	assert(inv_evec && evec);
+	double *eval = model->getEigenvalues();
+
+	// internal node
+	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+	FOR_NEIGHBOR_IT(node, dad, it) {
+        PhyloNeighbor *nei = (PhyloNeighbor*)(*it);
+        // make sure that the partial_lh of children are different!
+        assert(dad_branch->partial_lh != nei->partial_lh);
+		if (!left) left = nei; else right = nei;
+	}
+
+    // precomputed buffer to save times
+    double *buffer_partial_lh_ptr = buffer_partial_lh + (getBufferPartialLhSize() - (2*block+nstates)*VectorClass::size()*num_threads);
+    double *echildren = NULL;
+    double *partial_lh_leaves = NULL;
+
+    // pre-compute scaled branch length per category
+    double len_children[ncat*(node->degree()-1)]; // +1 in case num_leaves = 0
+    double *len_left = NULL, *len_right = NULL;
+
+    if (SITE_MODEL) {
+        double *len_children_ptr = len_children;
+        FOR_NEIGHBOR_IT(node, dad, it3) {
+            for (c = 0; c < ncat; c++) {
+                len_children_ptr[c] = site_rate->getRate(c) * (*it3)->length;
+            }
+            if (!len_left)
+                len_left = len_children_ptr;
+            else
+                len_right = len_children_ptr;
+            len_children_ptr += ncat;
+        }
+    } else {
+
+        echildren = info.echildren;
+        partial_lh_leaves = info.partial_lh_leaves;
+
+    }
+
+    double *eleft = echildren, *eright = echildren + block*nstates;
+
+	if (!left->node->isLeaf() && right->node->isLeaf()) {
+		PhyloNeighbor *tmp = left;
+		left = right;
+		right = tmp;
+        double *etmp = eleft;
+        eleft = eright;
+        eright = etmp;
+        etmp = len_left;
+        len_left = len_right;
+        len_right = etmp;
+	}
+
+    if (node->degree() > 3) {
+        /*--------------------- multifurcating node ------------------*/
+
+        // now for-loop computing partial_lh over all site-patterns
+        VectorClass *partial_lh_all = (VectorClass*) &buffer_partial_lh_ptr[block*VectorClass::size()*2*thread_id];
+        double *vec_tip = (double*)&partial_lh_all[block];
+
+        for (ptn = ptn_lower; ptn < ptn_upper; ptn+=VectorClass::size()) {
+            for (i = 0; i < block; i++)
+                partial_lh_all[i] = 1.0;
+            UBYTE *scale_dad = NULL;
+            if (SAFE_NUMERIC) {
+                scale_dad = dad_branch->scale_num + ptn*ncat_mix;
+                memset(scale_dad, 0, sizeof(UBYTE)*ncat_mix*VectorClass::size());
+            } else
+                memset(&dad_branch->scale_num[ptn], 0, sizeof(UBYTE)*VectorClass::size());
+
+            if (SITE_MODEL) {
+                VectorClass *expchild = partial_lh_all + block;
+                VectorClass *eval_ptr = (VectorClass*) &eval[ptn*nstates];
+                VectorClass *evec_ptr = (VectorClass*) &evec[ptn*states_square];
+                double *len_child = len_children;
+                VectorClass vchild;
+                FOR_NEIGHBOR_IT(node, dad, it) {
+                    PhyloNeighbor *child = (PhyloNeighbor*)*it;
+                    UBYTE *scale_child = SAFE_NUMERIC ? child->scale_num + ptn*ncat_mix : NULL;
+                    VectorClass *partial_lh = partial_lh_all;
+                    if (child->node->isLeaf()) {
+                        // external node
+                        VectorClass *tip_partial_lh_child = (VectorClass*) &tip_partial_lh[child->node->id*tip_mem_size + ptn*nstates];
+                        for (c = 0; c < ncat; c++) {
+                            for (i = 0; i < nstates; i++)
+                                expchild[i] = exp(eval_ptr[i]*len_child[c]) * tip_partial_lh_child[i];
+                            for (x = 0; x < nstates; x++) {
+                                VectorClass *this_evec = &evec_ptr[x*nstates];
+#ifdef KERNEL_FIX_STATES
+                                dotProductVec<VectorClass, VectorClass, nstates, FMA>(expchild, this_evec, vchild);
+#else
+                                dotProductVec<VectorClass, VectorClass, FMA>(expchild, this_evec, vchild, nstates);
+#endif
+                                partial_lh[x] *= vchild;
+                            }
+                            partial_lh += nstates;
+                        }
+                    } else {
+                        // internal node
+                        VectorClass *partial_lh = partial_lh_all;
+                        VectorClass *partial_lh_child = (VectorClass*)(child->partial_lh + ptn*block);
+                        if (!SAFE_NUMERIC) {
+                            for (i = 0; i < VectorClass::size(); i++)
+                                dad_branch->scale_num[ptn+i] += child->scale_num[ptn+i];
+                        }
+
+                        for (c = 0; c < ncat_mix; c++) {
+                            if (SAFE_NUMERIC) {
+                                for (x = 0; x < VectorClass::size(); x++)
+                                    scale_dad[x*ncat_mix+c] += scale_child[x*ncat_mix+c];
+                            }
+                            // compute real partial likelihood vector
+                            for (i = 0; i < nstates; i++)
+                                expchild[i] = exp(eval_ptr[i]*len_child[c]) * partial_lh_child[i];
+                            for (x = 0; x < nstates; x++) {
+                                VectorClass *this_evec = &evec_ptr[x*nstates];
+#ifdef KERNEL_FIX_STATES
+                                dotProductVec<VectorClass, VectorClass, nstates, FMA>(expchild, this_evec, vchild);
+#else
+                                dotProductVec<VectorClass, VectorClass, FMA>(expchild, this_evec, vchild, nstates);
+#endif
+                                partial_lh[x] *= vchild;
+                            }
+                            partial_lh += nstates;
+                            partial_lh_child += nstates;
+                        }
+                    } // if
+                    len_child += ncat;
+                } // FOR_NEIGHBOR
+
+            } else {
+                // non site specific model
+                double *partial_lh_leaf = partial_lh_leaves;
+                double *echild = echildren;
+
+                FOR_NEIGHBOR_IT(node, dad, it) {
+                    PhyloNeighbor *child = (PhyloNeighbor*)*it;
+                    UBYTE *scale_child = SAFE_NUMERIC ? child->scale_num + ptn*ncat_mix : NULL;
+                    if (child->node->isLeaf()) {
+                        // external node
+                        // load data for tip
+                        for (i = 0; i < VectorClass::size(); i++) {
+                            double *child_lh;
+                            if (ptn+i < orig_nptn)
+                                child_lh = partial_lh_leaf + block*(aln->at(ptn+i))[child->node->id];
+                            else if (ptn+i < max_orig_nptn)
+                                child_lh = partial_lh_leaf + block*aln->STATE_UNKNOWN;
+                            else if (ptn+i < nptn)
+                                child_lh = partial_lh_leaf + block*model_factory->unobserved_ptns[ptn+i-max_orig_nptn];
+                            else
+                                child_lh = partial_lh_leaf + block*aln->STATE_UNKNOWN;
+                            double *this_vec_tip = vec_tip+i;
+                            for (c = 0; c < block; c++) {
+                                *this_vec_tip = child_lh[c];
+                                this_vec_tip += VectorClass::size();
+                            }
+                        }
+                        VectorClass *vtip = (VectorClass*)vec_tip;
+                        for (c = 0; c < block; c++) {
+                            // compute real partial likelihood vector
+                            partial_lh_all[c] *= vtip[c];
+                        }
+                        partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
+                    } else {
+                        // internal node
+                        VectorClass *partial_lh = partial_lh_all;
+                        VectorClass *partial_lh_child = (VectorClass*)(child->partial_lh + ptn*block);
+                        if (!SAFE_NUMERIC) {
+                            for (i = 0; i < VectorClass::size(); i++)
+                                dad_branch->scale_num[ptn+i] += child->scale_num[ptn+i];
+                        }
+
+                        double *echild_ptr = echild;
+                        for (c = 0; c < ncat_mix; c++) {
+                            if (SAFE_NUMERIC) {
+                                for (x = 0; x < VectorClass::size(); x++)
+                                    scale_dad[x*ncat_mix+c] += scale_child[x*ncat_mix+c];
+                            }
+                            // compute real partial likelihood vector
+                            for (x = 0; x < nstates; x++) {
+                                VectorClass vchild = echild_ptr[0] * partial_lh_child[0];
+    //                            double *echild_ptr = echild + (c*nstatesqr+x*nstates);
+                                for (i = 1; i < nstates; i++) {
+                                    vchild = mul_add(echild_ptr[i], partial_lh_child[i], vchild);
+                                }
+                                echild_ptr += nstates;
+                                partial_lh[x] *= vchild;
+                            }
+                            partial_lh += nstates;
+                            partial_lh_child += nstates;
+                        }
+                    } // if
+                    echild += block*nstates;
+                } // FOR_NEIGHBOR
+            } // if SITE_MODEL
+            
+        
+            // compute dot-product with inv_eigenvector
+            VectorClass *partial_lh_tmp = partial_lh_all;
+            VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+            VectorClass lh_max = 0.0;
+            double *inv_evec_ptr = SITE_MODEL ? &inv_evec[ptn*states_square] : NULL;
+            for (c = 0; c < ncat_mix; c++) {
+                if (SAFE_NUMERIC)
+                    lh_max = 0.0;
+                if (SITE_MODEL) {
+                    // compute dot-product with inv_eigenvector
+#ifdef KERNEL_FIX_STATES
+                    productVecMat<VectorClass, VectorClass, nstates, FMA>(partial_lh_tmp, (VectorClass*)inv_evec_ptr, partial_lh);
+#else
+                    productVecMat<VectorClass, VectorClass, FMA> (partial_lh_tmp, (VectorClass*)inv_evec_ptr, partial_lh, nstates);
+#endif
+                } else {
+                    inv_evec_ptr = inv_evec + mix_addr[c];
+#ifdef KERNEL_FIX_STATES
+                    productVecMat<VectorClass, double, nstates, FMA>(partial_lh_tmp, inv_evec_ptr, partial_lh, lh_max);
+#else
+                    productVecMat<VectorClass, double, FMA> (partial_lh_tmp, inv_evec_ptr, partial_lh, lh_max, nstates);
+#endif
+                }
+                // check if one should scale partial likelihoods
+                if (SAFE_NUMERIC) {
+                    auto underflown = ((lh_max < SCALING_THRESHOLD) & (lh_max != 0.0) & (VectorClass().load_a(&ptn_invar[ptn]) == 0.0));
+                    if (horizontal_or(underflown)) { // at least one site has numerical underflown
+                        for (x = 0; x < VectorClass::size(); x++)
+                        if (underflown[x]) {
+                            // BQM 2016-05-03: only scale for non-constant sites
+                            // now do the likelihood scaling
+                            double *partial_lh = dad_branch->partial_lh + (ptn*block + c*nstates*VectorClass::size() + x);
+                            for (i = 0; i < nstates; i++)
+                                partial_lh[i*VectorClass::size()] *= SCALING_THRESHOLD_INVER;
+                            dad_branch->scale_num[(ptn+x)*ncat_mix+c] += 1;
+                        }
+                    }
+                }
+                partial_lh += nstates;
+                partial_lh_tmp += nstates;
+            }
+
+            if (!SAFE_NUMERIC) {
+                auto underflown = (lh_max < SCALING_THRESHOLD) & (lh_max != 0.0) & (VectorClass().load_a(&ptn_invar[ptn]) == 0.0);
+                if (horizontal_or(underflown)) { // at least one site has numerical underflown
+                    for (x = 0; x < VectorClass::size(); x++)
+                    if (underflown[x]) {
+                        double *partial_lh = dad_branch->partial_lh + (ptn*block + x);
+                        // now do the likelihood scaling
+                        for (i = 0; i < block; i++) {
+                            partial_lh[i*VectorClass::size()] *= SCALING_THRESHOLD_INVER;
+                        }
+//                        sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn+x];
+                        dad_branch->scale_num[ptn+x] += 1;
+                    }
+                }
+            }
+
+        } // for ptn
+
+        // end multifurcating treatment
+    } else if (left->node->isLeaf() && right->node->isLeaf()) {
+
+        /*--------------------- TIP-TIP (cherry) case ------------------*/
+
+        double *partial_lh_left = SITE_MODEL ? &tip_partial_lh[left->node->id * tip_mem_size] : partial_lh_leaves;
+        double *partial_lh_right = SITE_MODEL ? &tip_partial_lh[right->node->id * tip_mem_size] : partial_lh_leaves + (aln->STATE_UNKNOWN+1)*block;
+
+		// scale number must be ZERO
+	    memset(dad_branch->scale_num + (SAFE_NUMERIC ? ptn_lower*ncat_mix : ptn_lower), 0, scale_size * sizeof(UBYTE));
+        double *vec_left = buffer_partial_lh_ptr + (block*2 + nstates)*VectorClass::size()*thread_id;
+
+        double *vec_right =  SITE_MODEL ? &vec_left[nstates*VectorClass::size()] : &vec_left[block*VectorClass::size()];
+        VectorClass *partial_lh_tmp = SITE_MODEL ? (VectorClass*)vec_right+nstates : (VectorClass*)vec_right+block;
+
+		for (ptn = ptn_lower; ptn < ptn_upper; ptn+=VectorClass::size()) {
+			VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+
+            if (SITE_MODEL) {
+                VectorClass* expleft = (VectorClass*) vec_left;
+                VectorClass* expright = (VectorClass*) vec_right;
+                VectorClass *vleft = (VectorClass*) &partial_lh_left[ptn*nstates];
+                VectorClass *vright = (VectorClass*) &partial_lh_right[ptn*nstates];
+                VectorClass *eval_ptr = (VectorClass*) &eval[ptn*nstates];
+                VectorClass *evec_ptr = (VectorClass*) &evec[ptn*states_square];
+                VectorClass *inv_evec_ptr = (VectorClass*) &inv_evec[ptn*states_square];
+                for (c = 0; c < ncat; c++) {
+                    for (i = 0; i < nstates; i++) {
+                        expleft[i] = exp(eval_ptr[i]*len_left[c]) * vleft[i];
+                        expright[i] = exp(eval_ptr[i]*len_right[c]) * vright[i];
+
+                    }
+                    // compute real partial likelihood vector
+                    for (x = 0; x < nstates; x++) {
+                        VectorClass *this_evec = evec_ptr + x*nstates;
+#ifdef KERNEL_FIX_STATES
+                        dotProductDualVec<VectorClass, VectorClass, nstates, FMA>(this_evec, expleft, this_evec, expright, partial_lh_tmp[x]);
+#else
+                        dotProductDualVec<VectorClass, VectorClass, FMA>(this_evec, expleft, this_evec, expright, partial_lh_tmp[x], nstates);
+#endif
+                    }
+                    // compute dot-product with inv_eigenvector
+#ifdef KERNEL_FIX_STATES
+                    productVecMat<VectorClass, VectorClass, nstates, FMA>(partial_lh_tmp, inv_evec_ptr, partial_lh);
+#else
+                    productVecMat<VectorClass, VectorClass, FMA> (partial_lh_tmp, inv_evec_ptr, partial_lh, nstates);
+#endif
+                    partial_lh += nstates;
+                } // FOR category
+            } else {
+                VectorClass *vleft = (VectorClass*)vec_left;
+                VectorClass *vright = (VectorClass*)vec_right;
+                // load data for tip
+                for (x = 0; x < VectorClass::size(); x++) {
+                    double *tip_left, *tip_right;
+                    if (ptn+x < orig_nptn) {
+                        tip_left  = partial_lh_left  + block * (aln->at(ptn+x))[left->node->id];
+                        tip_right = partial_lh_right + block * (aln->at(ptn+x))[right->node->id];
+                    } else if (ptn+x < max_orig_nptn) {
+                        tip_left  = partial_lh_left  + block * aln->STATE_UNKNOWN;
+                        tip_right = partial_lh_right + block * aln->STATE_UNKNOWN;
+                    } else if (ptn+x < nptn) {
+                        tip_left  = partial_lh_left  + block * model_factory->unobserved_ptns[ptn+x-max_orig_nptn];
+                        tip_right = partial_lh_right + block * model_factory->unobserved_ptns[ptn+x-max_orig_nptn];
+                    } else {
+                        tip_left  = partial_lh_left  + block * aln->STATE_UNKNOWN;
+                        tip_right = partial_lh_right + block * aln->STATE_UNKNOWN;
+                    }
+                    double *this_vec_left = vec_left+x;
+                    double *this_vec_right = vec_right+x;
+                    for (i = 0; i < block; i++) {
+                        *this_vec_left = tip_left[i];
+                        *this_vec_right = tip_right[i];
+                        this_vec_left += VectorClass::size();
+                        this_vec_right += VectorClass::size();
+                    }
+                }
+
+
+                for (c = 0; c < ncat_mix; c++) {
+                    double *inv_evec_ptr = inv_evec + mix_addr[c];
+                    // compute real partial likelihood vector
+                    for (x = 0; x < nstates; x++) {
+                        partial_lh_tmp[x] = vleft[x] * vright[x];
+                    }
+
+                    // compute dot-product with inv_eigenvector
+#ifdef KERNEL_FIX_STATES
+                    productVecMat<VectorClass, double, nstates, FMA>(partial_lh_tmp, inv_evec_ptr, partial_lh);
+#else
+                    productVecMat<VectorClass, double, FMA> (partial_lh_tmp, inv_evec_ptr, partial_lh, nstates);
+#endif
+
+                    // increase pointer
+                    vleft += nstates;
+                    vright += nstates;
+                    partial_lh += nstates;
+                } // FOR category
+            } // IF SITE_MODEL
+		} // FOR LOOP
+
+
+	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
+
+        /*--------------------- TIP-INTERNAL NODE case ------------------*/
+
+		// only take scale_num from the right subtree
+		memcpy(
+            dad_branch->scale_num + (SAFE_NUMERIC ? ptn_lower*ncat_mix : ptn_lower),
+            right->scale_num + (SAFE_NUMERIC ? ptn_lower*ncat_mix : ptn_lower),
+            scale_size * sizeof(UBYTE));
+
+        double *partial_lh_left = SITE_MODEL ? &tip_partial_lh[left->node->id * tip_mem_size] : partial_lh_leaves;
+
+
+        double *vec_left = buffer_partial_lh_ptr + (2*block+nstates)*VectorClass::size()*thread_id;
+        VectorClass *partial_lh_tmp = SITE_MODEL ? (VectorClass*)vec_left+2*nstates : (VectorClass*)vec_left+block;
+
+		for (ptn = ptn_lower; ptn < ptn_upper; ptn+=VectorClass::size()) {
+			VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+			VectorClass *partial_lh_right = (VectorClass*)(right->partial_lh + ptn*block);
+//            memset(partial_lh, 0, sizeof(VectorClass)*block);
+            VectorClass lh_max = 0.0;
+
+            if (SITE_MODEL) {
+                VectorClass *expleft = (VectorClass*)vec_left;
+                VectorClass *expright = expleft+nstates;
+                VectorClass *vleft = (VectorClass*)&partial_lh_left[ptn*nstates];
+                VectorClass *eval_ptr = (VectorClass*) &eval[ptn*nstates];
+                VectorClass *evec_ptr = (VectorClass*) &evec[ptn*states_square];
+                VectorClass *inv_evec_ptr = (VectorClass*) &inv_evec[ptn*states_square];
+                for (c = 0; c < ncat; c++) {
+                    for (i = 0; i < nstates; i++) {
+                        expleft[i] = exp(eval_ptr[i]*len_left[c]) * vleft[i];
+                        expright[i] = exp(eval_ptr[i]*len_right[c]) * partial_lh_right[i];
+                    }
+                    // compute real partial likelihood vector
+                    for (x = 0; x < nstates; x++) {
+                        VectorClass *this_evec = evec_ptr + x*nstates;
+#ifdef KERNEL_FIX_STATES
+                        dotProductDualVec<VectorClass, VectorClass, nstates, FMA>(this_evec, expleft, this_evec, expright, partial_lh_tmp[x]);
+#else
+                        dotProductDualVec<VectorClass, VectorClass, FMA>(this_evec, expleft, this_evec, expright, partial_lh_tmp[x], nstates);
+#endif
+                    }
+                    // compute dot-product with inv_eigenvector
+#ifdef KERNEL_FIX_STATES
+                    productVecMat<VectorClass, VectorClass, nstates, FMA>(partial_lh_tmp, inv_evec_ptr, partial_lh, lh_max);
+#else
+                    productVecMat<VectorClass, VectorClass, FMA> (partial_lh_tmp, inv_evec_ptr, partial_lh, lh_max, nstates);
+#endif
+                    // check if one should scale partial likelihoods
+                    if (SAFE_NUMERIC) {
+                        auto underflown = ((lh_max < SCALING_THRESHOLD) & (lh_max != 0.0) & (VectorClass().load_a(&ptn_invar[ptn]) == 0.0));
+                        if (horizontal_or(underflown)) { // at least one site has numerical underflown
+                            for (x = 0; x < VectorClass::size(); x++)
+                            if (underflown[x]) {
+                                // BQM 2016-05-03: only scale for non-constant sites
+                                // now do the likelihood scaling
+                                double *partial_lh = dad_branch->partial_lh + (ptn*block + c*nstates*VectorClass::size() + x);
+                                for (i = 0; i < nstates; i++)
+                                    partial_lh[i*VectorClass::size()] *= SCALING_THRESHOLD_INVER;
+                                dad_branch->scale_num[(ptn+x)*ncat_mix+c] += 1;
+                            }
+                        }
+                    }
+                    partial_lh_right += nstates;
+                    partial_lh += nstates;
+                } // FOR category
+
+            } else {
+                VectorClass *vleft = (VectorClass*)vec_left;
+                // load data for tip
+                for (x = 0; x < VectorClass::size(); x++) {
+                    double *tip;
+                    if (ptn+x < orig_nptn) {
+                        tip = partial_lh_left + block*(aln->at(ptn+x))[left->node->id];
+                    } else if (ptn+x < max_orig_nptn) {
+                        tip = partial_lh_left + block*aln->STATE_UNKNOWN;
+                    } else if (ptn+x < nptn) {
+                        tip = partial_lh_left + block*model_factory->unobserved_ptns[ptn+x-max_orig_nptn];
+                    } else {
+                        tip = partial_lh_left + block*aln->STATE_UNKNOWN;
+                    }
+                    double *this_vec_left = vec_left+x;
+                    for (i = 0; i < block; i++) {
+                        *this_vec_left = tip[i];
+                        this_vec_left += VectorClass::size();
+                    }
+                }
+
+                double *eright_ptr = eright;
+                for (c = 0; c < ncat_mix; c++) {
+                    if (SAFE_NUMERIC)
+                        lh_max = 0.0;
+                    double *inv_evec_ptr = inv_evec + mix_addr[c];
+                    // compute real partial likelihood vector
+                    for (x = 0; x < nstates; x++) {
+                        VectorClass vright;
+    #ifdef KERNEL_FIX_STATES
+                        dotProductVec<VectorClass, double, nstates, FMA>(eright_ptr, partial_lh_right, vright);
+    #else
+                        dotProductVec<VectorClass, double, FMA>(eright_ptr, partial_lh_right, vright, nstates);
+    #endif
+                        eright_ptr += nstates;
+                        partial_lh_tmp[x] = vleft[x] * (vright);
+                    }
+
+                    // compute dot-product with inv_eigenvector
+    #ifdef KERNEL_FIX_STATES
+                    productVecMat<VectorClass, double, nstates, FMA>(partial_lh_tmp, inv_evec_ptr, partial_lh, lh_max);
+    #else
+                    productVecMat<VectorClass, double, FMA> (partial_lh_tmp, inv_evec_ptr, partial_lh, lh_max, nstates);
+    #endif
+                    // check if one should scale partial likelihoods
+                    if (SAFE_NUMERIC) {
+                        auto underflown = ((lh_max < SCALING_THRESHOLD) & (lh_max != 0.0) & (VectorClass().load_a(&ptn_invar[ptn]) == 0.0));
+                        if (horizontal_or(underflown)) { // at least one site has numerical underflown
+                            for (x = 0; x < VectorClass::size(); x++)
+                            if (underflown[x]) {
+                                // BQM 2016-05-03: only scale for non-constant sites
+                                // now do the likelihood scaling
+                                double *partial_lh = dad_branch->partial_lh + (ptn*block + c*nstates*VectorClass::size() + x);
+                                for (i = 0; i < nstates; i++)
+                                    partial_lh[i*VectorClass::size()] *= SCALING_THRESHOLD_INVER;
+                                dad_branch->scale_num[(ptn+x)*ncat_mix+c] += 1;
+                            }
+                        }
+                    }
+                    vleft += nstates;
+                    partial_lh_right += nstates;
+                    partial_lh += nstates;
+                } // FOR category
+            } // IF SITE_MODEL
+
+            if (!SAFE_NUMERIC) {
+                auto underflown = (lh_max < SCALING_THRESHOLD) & (lh_max != 0.0) & (VectorClass().load_a(&ptn_invar[ptn]) == 0.0);
+                if (horizontal_or(underflown)) { // at least one site has numerical underflown
+                    for (x = 0; x < VectorClass::size(); x++)
+                    if (underflown[x]) {
+                        double *partial_lh = dad_branch->partial_lh + (ptn*block + x);
+                        // now do the likelihood scaling
+                        for (i = 0; i < block; i++) {
+                            partial_lh[i*VectorClass::size()] *= SCALING_THRESHOLD_INVER;
+                        }
+//                        sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn+x];
+                        dad_branch->scale_num[ptn+x] += 1;
+                    }
+                }
+            }
+
+		} // big for loop over ptn
+
+	} else {
+
+        /*--------------------- INTERNAL-INTERNAL NODE case ------------------*/
+
+        VectorClass *partial_lh_tmp = (VectorClass*)buffer_partial_lh_ptr + (2*block+nstates)*thread_id;
+		for (ptn = ptn_lower; ptn < ptn_upper; ptn+=VectorClass::size()) {
+			VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+			VectorClass *partial_lh_left = (VectorClass*)(left->partial_lh + ptn*block);
+			VectorClass *partial_lh_right = (VectorClass*)(right->partial_lh + ptn*block);
+            VectorClass lh_max = 0.0;
+            UBYTE *scale_dad, *scale_left, *scale_right;
+
+            if (SAFE_NUMERIC) {
+                size_t addr = ptn*ncat_mix;
+                scale_dad = dad_branch->scale_num + addr;
+                scale_left = left->scale_num + addr;
+                scale_right = right->scale_num + addr;
+            } else {
+                scale_dad = dad_branch->scale_num + ptn;
+                scale_left = left->scale_num + ptn;
+                scale_right = right->scale_num + ptn;
+                for (i = 0; i < VectorClass::size(); i++)
+                    scale_dad[i] = scale_left[i] + scale_right[i];
+            }
+
+            double *eleft_ptr = eleft;
+            double *eright_ptr = eright;
+            VectorClass *expleft, *expright, *eval_ptr, *evec_ptr, *inv_evec_ptr;
+            if (SITE_MODEL) {
+                expleft = partial_lh_tmp + nstates;
+                expright = expleft + nstates;
+                eval_ptr = (VectorClass*) &eval[ptn*nstates];
+                evec_ptr = (VectorClass*) &evec[ptn*states_square];
+                inv_evec_ptr = (VectorClass*) &inv_evec[ptn*states_square];
+            }
+
+			for (c = 0; c < ncat_mix; c++) {
+                if (SAFE_NUMERIC) {
+                    lh_max = 0.0;
+                    for (x = 0; x < VectorClass::size(); x++)
+                        scale_dad[x*ncat_mix] = scale_left[x*ncat_mix] + scale_right[x*ncat_mix];
+                }
+
+                if (SITE_MODEL) {
+                    // site-specific model
+                    for (i = 0; i < nstates; i++) {
+                        expleft[i] = exp(eval_ptr[i]*len_left[c]) * partial_lh_left[i];
+                        expright[i] = exp(eval_ptr[i]*len_right[c]) * partial_lh_right[i];
+                    }
+                    for (x = 0; x < nstates; x++) {
+                        VectorClass *this_evec = evec_ptr + x*nstates;
+#ifdef KERNEL_FIX_STATES
+                        dotProductDualVec<VectorClass, VectorClass, nstates, FMA>(this_evec, expleft, this_evec, expright, partial_lh_tmp[x]);
+#else
+                        dotProductDualVec<VectorClass, VectorClass, FMA>(this_evec, expleft, this_evec, expright, partial_lh_tmp[x], nstates);
+#endif
+                    }
+#ifdef KERNEL_FIX_STATES
+                    productVecMat<VectorClass, VectorClass, nstates, FMA>(partial_lh_tmp, inv_evec_ptr, partial_lh, lh_max);
+#else
+                    productVecMat<VectorClass, VectorClass, FMA> (partial_lh_tmp, inv_evec_ptr, partial_lh, lh_max, nstates);
+#endif
+                } else {
+                    // normal model
+                    double *inv_evec_ptr = inv_evec + mix_addr[c];
+                    // compute real partial likelihood vector
+                    for (x = 0; x < nstates; x++) {
+#ifdef KERNEL_FIX_STATES
+                        dotProductDualVec<VectorClass, double, nstates, FMA>(eleft_ptr, partial_lh_left, eright_ptr, partial_lh_right, partial_lh_tmp[x]);
+#else
+                        dotProductDualVec<VectorClass, double, FMA>(eleft_ptr, partial_lh_left, eright_ptr, partial_lh_right, partial_lh_tmp[x], nstates);
+#endif
+                        eleft_ptr += nstates;
+                        eright_ptr += nstates;
+                    }
+                    
+                    // compute dot-product with inv_eigenvector
+#ifdef KERNEL_FIX_STATES
+                    productVecMat<VectorClass, double, nstates, FMA>(partial_lh_tmp, inv_evec_ptr, partial_lh, lh_max);
+#else
+                    productVecMat<VectorClass, double, FMA> (partial_lh_tmp, inv_evec_ptr, partial_lh, lh_max, nstates);
+#endif
+                }
+
+                // check if one should scale partial likelihoods
+                if (SAFE_NUMERIC) {
+                    auto underflown = ((lh_max < SCALING_THRESHOLD) & (lh_max != 0.0) & (VectorClass().load_a(&ptn_invar[ptn]) == 0.0));
+                    if (horizontal_or(underflown))
+                        for (x = 0; x < VectorClass::size(); x++)
+                        if (underflown[x]) {
+                            // BQM 2016-05-03: only scale for non-constant sites
+                            // now do the likelihood scaling
+                            double *partial_lh = dad_branch->partial_lh + (ptn*block + c*nstates*VectorClass::size() + x);
+                            for (i = 0; i < nstates; i++)
+                                partial_lh[i*VectorClass::size()] *= SCALING_THRESHOLD_INVER;
+                            scale_dad[x*ncat_mix] += 1;
+                        }
+                    scale_dad++;
+                    scale_left++;
+                    scale_right++;
+                }
+                partial_lh_left += nstates;
+                partial_lh_right += nstates;
+                partial_lh += nstates;
+			}
+
+            if (!SAFE_NUMERIC) {
+                // check if one should scale partial likelihoods
+                auto underflown = (lh_max < SCALING_THRESHOLD) & (lh_max != 0.0) & (VectorClass().load_a(&ptn_invar[ptn]) == 0.0);
+                if (horizontal_or(underflown)) { // at least one site has numerical underflown
+                    for (x = 0; x < VectorClass::size(); x++)
+                    if (underflown[x]) {
+                        double *partial_lh = dad_branch->partial_lh + (ptn*block + x);
+                        // now do the likelihood scaling
+                        for (i = 0; i < block; i++) {
+                            partial_lh[i*VectorClass::size()] *= SCALING_THRESHOLD_INVER;
+                        }
+//                        sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn+x];
+                        dad_branch->scale_num[ptn+x] += 1;
+                    }
+                }
+            }
+
+		} // big for loop over ptn
+
+	}
+}
+
+/*******************************************************
+ * NEW! highly-vectorized log-likelihood derivative function
+ * computeLikelihoodBuffer[Generic]SIMD: for the patterns in [ptn_lower, ptn_upper) it first runs computePartialLikelihood() on every
+ * traversal_info entry, then fills theta_all (element-wise product of the two likelihood vectors of the branch) and buffer_scale_all.
+ ******************************************************/
+// dad_branch: neighbor record whose ->node is the node opposite dad. dad: when it is a leaf, its tip likelihoods replace node_branch->partial_lh.
+// ptn_lower/ptn_upper: pattern range, walked VectorClass::size() patterns per step. thread_id: passed on to computePartialLikelihood and selects the scratch slice.
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA, const bool SITE_MODEL>
+void PhyloTree::computeLikelihoodBufferSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, size_t ptn_lower, size_t ptn_upper, int thread_id)
+#else
+template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA, const bool SITE_MODEL>
+void PhyloTree::computeLikelihoodBufferGenericSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, size_t ptn_lower, size_t ptn_upper, int thread_id)
+#endif
+{
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad); // the neighbor entry of node that leads back to dad
+
+#ifndef KERNEL_FIX_STATES
+    size_t nstates = aln->num_states; // run-time state count; it is a template constant when KERNEL_FIX_STATES is defined
+#endif
+    size_t orig_nptn = aln->size();
+    size_t max_orig_nptn = ((orig_nptn+VectorClass::size()-1)/VectorClass::size())*VectorClass::size(); // orig_nptn rounded up to a multiple of the vector width
+    size_t nptn = max_orig_nptn+model_factory->unobserved_ptns.size(); // padded alignment patterns followed by the unobserved patterns
+    size_t ptn, i, c;
+    size_t ncat = site_rate->getNRate();
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures(); // categories per pattern: ncat if fused_mix_rate, else ncat times the mixture count
+
+    size_t block = ncat_mix * nstates; // entries per pattern in partial_lh and theta_all
+    size_t tip_block = nstates * model->getNMixtures(); // stride of tip_partial_lh per value of aln->at(ptn)[id] (presumably the state code)
+    size_t mix_addr_nstates[ncat_mix], mix_addr[ncat_mix]; // NOTE(review): mix_addr is filled below but never read in this function
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+    for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom; // mixture index that category c belongs to
+        mix_addr_nstates[c] = m*nstates; // offset of that mixture inside one tip_block
+        mix_addr[c] = mix_addr_nstates[c]*nstates;
+    }
+
+    // skip the first 3 blocks of buffer_partial_lh: computeLikelihoodDerv places val0, val1 and val2 there
+    double *buffer_partial_lh_ptr = buffer_partial_lh + 3*get_safe_upper_limit(block);
+
+    // first compute partial_lh for every entry of traversal_info, restricted to this pattern range
+    for (vector<TraversalInfo>::iterator it = traversal_info.begin(); it != traversal_info.end(); it++)
+        computePartialLikelihood(*it, ptn_lower, ptn_upper, thread_id);
+
+    if (dad->isLeaf()) {
+        // special treatment for TIP-INTERNAL NODE case: theta = tip likelihood of dad * dad_branch->partial_lh
+        double *tip_partial_lh_node = &tip_partial_lh[dad->id * max_orig_nptn*nstates]; // only dereferenced when SITE_MODEL is true
+
+        double *vec_tip = buffer_partial_lh_ptr + tip_block*VectorClass::size()*thread_id; // scratch area for the tip vector, offset by thread_id
+
+        for (ptn = ptn_lower; ptn < ptn_upper; ptn+=VectorClass::size()) {
+            VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+            VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
+            //load tip vector: copy the tip_block entries of pattern ptn+i into lane i of vec_tip
+            if (!SITE_MODEL)
+            for (i = 0; i < VectorClass::size(); i++) {
+                double *this_tip_partial_lh;
+                if (ptn+i < orig_nptn) // pattern taken from the alignment
+                    this_tip_partial_lh = tip_partial_lh + tip_block*(aln->at(ptn+i))[dad->id];
+                else if (ptn+i < max_orig_nptn) // padding up to the vector width
+                    this_tip_partial_lh = tip_partial_lh + tip_block*aln->STATE_UNKNOWN;
+                else if (ptn+i < nptn) // one of the unobserved patterns
+                    this_tip_partial_lh = tip_partial_lh + tip_block*model_factory->unobserved_ptns[ptn+i-max_orig_nptn];
+                else // index at or beyond nptn
+                    this_tip_partial_lh = tip_partial_lh + tip_block*aln->STATE_UNKNOWN;
+                double *this_vec_tip = vec_tip+i;
+                for (c = 0; c < tip_block; c++) { // c is only a counter over the tip_block entries here
+                    *this_vec_tip = this_tip_partial_lh[c];
+                    this_vec_tip += VectorClass::size(); // lanes are interleaved: consecutive entries sit VectorClass::size() doubles apart
+                }
+
+            }
+            VectorClass *lh_tip;
+            if (SITE_MODEL)
+                lh_tip = (VectorClass*)&tip_partial_lh_node[ptn*nstates]; // read in place; the same nstates vectors are reused for every category
+            for (c = 0; c < ncat_mix; c++) {
+                if (!SITE_MODEL)
+                    lh_tip = (VectorClass*)(vec_tip + mix_addr_nstates[c]*VectorClass::size()); // tip entries of the mixture of category c
+                for (i = 0; i < nstates; i++) {
+                    theta[i] = lh_tip[i] * partial_lh_dad[i];
+                }
+                partial_lh_dad += nstates;
+                theta += nstates;
+            }
+            if (SAFE_NUMERIC) {
+                // numerical scaling per category: express every category of a pattern relative to the smallest scale count
+                UBYTE *scale_dad;
+                UBYTE min_scale;
+                for (i = 0; i < VectorClass::size(); i++) {
+                    scale_dad = dad_branch->scale_num+(ptn+i)*ncat_mix; // ncat_mix counts per pattern
+                    min_scale = scale_dad[0];
+                    for (c = 1; c < ncat_mix; c++)
+                        min_scale = min(min_scale, scale_dad[c]);
+
+                    buffer_scale_all[ptn+i] = min_scale;
+
+                    for (c = 0; c < ncat_mix; c++) {
+                        if (scale_dad[c] == min_scale+1) { // exactly one count above the minimum: multiply by SCALING_THRESHOLD once
+                            double *this_theta = &theta_all[ptn*block + c*nstates*VectorClass::size() + i]; // lane i, category c
+                            for (size_t x = 0; x < nstates; x++) {
+                                this_theta[x*VectorClass::size()] *= SCALING_THRESHOLD;
+                            }
+                        } else if (scale_dad[c] > min_scale+1) { // two or more counts above the minimum: the entries are set to zero
+                            double *this_theta = &theta_all[ptn*block + c*nstates*VectorClass::size() + i];
+                            for (size_t x = 0; x < nstates; x++) {
+                                this_theta[x*VectorClass::size()] = 0.0;
+                            }
+                        }
+                    }
+                }
+            } else {
+                // normal scaling: a single scale count per pattern
+                for (i = 0; i < VectorClass::size(); i++)
+                    buffer_scale_all[ptn+i] = dad_branch->scale_num[ptn+i];
+            }
+            VectorClass *buf = (VectorClass*)(buffer_scale_all+ptn);
+            *buf *= LOG_SCALING_THRESHOLD; // stored value becomes scale count times LOG_SCALING_THRESHOLD
+
+        } // FOR PTN LOOP
+//            aligned_free(vec_tip);
+    } else {
+        //------- both dad and node are internal nodes  --------//
+
+        // now compute theta = node_branch->partial_lh * dad_branch->partial_lh, entry by entry
+        for (ptn = ptn_lower; ptn < ptn_upper; ptn+=VectorClass::size()) {
+            VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
+            VectorClass *partial_lh_node = (VectorClass*)(node_branch->partial_lh + ptn*block);
+            VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+            for (i = 0; i < block; i++)
+                theta[i] = partial_lh_node[i] * partial_lh_dad[i];
+
+            if (SAFE_NUMERIC) {
+                // numerical scaling per category, using the combined counts of both sides of the branch
+                UBYTE min_scale;
+                UBYTE sum_scale[ncat_mix]; // combined count of each category for the pattern being processed
+                size_t ptn_ncat = ptn*ncat_mix; 
+                UBYTE *scale_dad = dad_branch->scale_num + ptn_ncat;
+                UBYTE *scale_node = node_branch->scale_num + ptn_ncat;
+
+                for (i = 0; i < VectorClass::size(); i++) {
+                    min_scale = sum_scale[0] = scale_dad[0] + scale_node[0]; // NOTE(review): the sum is stored in a UBYTE; assumes it fits - TODO confirm
+                    for (c = 1; c < ncat_mix; c++) {
+                        sum_scale[c] = scale_dad[c] + scale_node[c];
+                        min_scale = min(min_scale, sum_scale[c]);
+                    }
+                    buffer_scale_all[ptn+i] = min_scale;
+
+                    for (c = 0; c < ncat_mix; c++) {
+                        if (sum_scale[c] == min_scale+1) { // exactly one count above the minimum: multiply by SCALING_THRESHOLD once
+                            double *this_theta = &theta_all[ptn*block + c*nstates*VectorClass::size() + i]; // lane i, category c
+                            for (size_t x = 0; x < nstates; x++) {
+                                this_theta[x*VectorClass::size()] *= SCALING_THRESHOLD;
+                            }
+                        } else if (sum_scale[c] > min_scale+1) { // two or more counts above the minimum: the entries are set to zero
+                            double *this_theta = &theta_all[ptn*block + c*nstates*VectorClass::size() + i];
+                            for (size_t x = 0; x < nstates; x++) {
+                                this_theta[x*VectorClass::size()] = 0.0;
+                            }
+                        }
+                    }
+                    scale_dad += ncat_mix; // move on to the counts of the next pattern
+                    scale_node += ncat_mix;
+                }
+            } else {
+                for (i = 0; i < VectorClass::size(); i++) // a single count per pattern: add both sides
+                    buffer_scale_all[ptn+i] = dad_branch->scale_num[ptn+i] + node_branch->scale_num[ptn+i];
+            }
+            VectorClass *buf = (VectorClass*)(buffer_scale_all+ptn);
+            *buf *= LOG_SCALING_THRESHOLD; // stored value becomes scale count times LOG_SCALING_THRESHOLD
+        } // FOR ptn
+    } // internal node
+}
+
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA, const bool SITE_MODEL>
+void PhyloTree::computeLikelihoodDervSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf)
+#else
+template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA, const bool SITE_MODEL>
+void PhyloTree::computeLikelihoodDervGenericSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf)
+#endif
+{
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    if (node->isLeaf()) {
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+
+#ifdef KERNEL_FIX_STATES
+    computeTraversalInfo<VectorClass, nstates>(node, dad, false);
+#else
+    computeTraversalInfo<VectorClass>(node, dad, false);
+#endif
+
+//
+//    if ((dad_branch->partial_lh_computed & 1) == 0)
+//        computePartialLikelihood(dad_branch, dad);
+//    if ((node_branch->partial_lh_computed & 1) == 0)
+//        computePartialLikelihood(node_branch, node);
+
+#ifndef KERNEL_FIX_STATES
+    size_t nstates = aln->num_states;
+#endif
+    size_t ncat = site_rate->getNRate();
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
+
+    size_t block = ncat_mix * nstates;
+//    size_t tip_block = nstates * model->getNMixtures();
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i;
+    size_t orig_nptn = aln->size();
+    size_t max_orig_nptn = ((orig_nptn+VectorClass::size()-1)/VectorClass::size())*VectorClass::size();
+    size_t nptn = max_orig_nptn+model_factory->unobserved_ptns.size();
+    bool isASC = model_factory->unobserved_ptns.size() > 0;
+
+
+
+    size_t mix_addr_nstates[ncat_mix], mix_addr[ncat_mix];
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+    for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom;
+        mix_addr_nstates[c] = m*nstates;
+        mix_addr[c] = mix_addr_nstates[c]*nstates;
+    }
+
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+    double *buffer_partial_lh_ptr = buffer_partial_lh;
+    vector<size_t> limits;
+    computeBounds<VectorClass>(num_threads, nptn, limits);
+
+	assert(theta_all);
+
+    double *val0 = NULL;
+    double *val1 = NULL;
+    double *val2 = NULL;
+    double cat_rate[ncat];
+    double cat_prop[ncat];
+
+
+    if (SITE_MODEL) {
+        for (c = 0; c < ncat; c++) {
+            cat_rate[c] = site_rate->getRate(c);
+            cat_prop[c] = site_rate->getProp(c);
+        }
+    } else {
+        val0 = buffer_partial_lh_ptr;
+        val1 = val0 + get_safe_upper_limit(block);
+        val2 = val1 + get_safe_upper_limit(block);
+        if (nstates % VectorClass::size() == 0) {
+            VectorClass *vc_val0 = (VectorClass*)val0;
+            VectorClass *vc_val1 = (VectorClass*)val1;
+            VectorClass *vc_val2 = (VectorClass*)val2;
+
+            double len = dad_branch->length;
+            size_t loop_size = nstates/VectorClass::size();
+            for (c = 0; c < ncat_mix; c++) {
+                size_t m = c/denom;
+                VectorClass *eval_ptr = (VectorClass*)(eval + mix_addr_nstates[c]);
+                size_t mycat = c%ncat;
+                double prop = site_rate->getProp(mycat) * model->getMixtureWeight(m);
+                double myrate = site_rate->getRate(mycat);
+                for (i = 0; i < loop_size; i++) {
+                    VectorClass cof = eval_ptr[i] * myrate;
+                    VectorClass val = exp(cof*len) * prop;
+                    VectorClass val1_ = cof*val;
+                    vc_val0[i] = val;
+                    vc_val1[i] = val1_;
+                    vc_val2[i] = cof*val1_;
+                }
+                vc_val0 += loop_size;
+                vc_val1 += loop_size;
+                vc_val2 += loop_size;
+            }
+        } else {
+            for (c = 0; c < ncat_mix; c++) {
+                size_t m = c/denom;
+                double *eval_ptr = eval + mix_addr_nstates[c];
+                size_t mycat = c%ncat;
+                double prop = site_rate->getProp(mycat) * model->getMixtureWeight(m);
+                size_t addr = c*nstates;
+                for (i = 0; i < nstates; i++) {
+                    double cof = eval_ptr[i]*site_rate->getRate(mycat);
+                    double val = exp(cof*dad_branch->length) * prop;
+                    double val1_ = cof*val;
+                    val0[addr+i] = val;
+                    val1[addr+i] = val1_;
+                    val2[addr+i] = cof*val1_;
+                }
+            }
+        }
+    }
+
+    double dad_length = dad_branch->length;
+
+    VectorClass all_df = 0.0, all_ddf = 0.0, all_prob_const = 0.0, all_df_const = 0.0, all_ddf_const = 0.0;
+//    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 1) private(ptn, i, c) num_threads(num_threads)
+#endif
+    for (int thread_id = 0; thread_id < num_threads; thread_id++) {
+        VectorClass my_df(0.0), my_ddf(0.0), vc_prob_const(0.0), vc_df_const(0.0), vc_ddf_const(0.0);
+        size_t ptn_lower = limits[thread_id];
+        size_t ptn_upper = limits[thread_id+1];
+
+        if (!theta_computed)
+        #ifdef KERNEL_FIX_STATES
+            computeLikelihoodBufferSIMD<VectorClass, SAFE_NUMERIC, nstates, FMA, SITE_MODEL>(dad_branch, dad, ptn_lower, ptn_upper, thread_id);
+        #else
+            computeLikelihoodBufferGenericSIMD<VectorClass, SAFE_NUMERIC, FMA, SITE_MODEL>(dad_branch, dad, ptn_lower, ptn_upper, thread_id);
+        #endif
+        
+        for (ptn = ptn_lower; ptn < ptn_upper; ptn+=VectorClass::size()) {
+            VectorClass lh_ptn;
+            //lh_ptn.load_a(&ptn_invar[ptn]);
+            VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
+            VectorClass df_ptn, ddf_ptn;
+
+            if (SITE_MODEL) {
+                VectorClass* eval_ptr = (VectorClass*) &eval[ptn*nstates];
+                lh_ptn = 0.0; df_ptn = 0.0; ddf_ptn = 0.0;
+                for (c = 0; c < ncat; c++) {
+                    VectorClass lh_cat(0.0), df_cat(0.0), ddf_cat(0.0);
+                    for (i = 0; i < nstates; i++) {
+                        VectorClass cof = eval_ptr[i] * cat_rate[c];
+                        VectorClass val = exp(cof*dad_length)*theta[i];
+                        VectorClass val1 = cof*val;
+                        lh_cat += val;
+                        df_cat += val1;
+                        ddf_cat = mul_add(cof, val1, ddf_cat);
+                    }
+                    lh_ptn = mul_add(cat_prop[c], lh_cat, lh_ptn);
+                    df_ptn = mul_add(cat_prop[c], df_cat, df_ptn);
+                    ddf_ptn = mul_add(cat_prop[c], ddf_cat, ddf_ptn);
+                    theta += nstates;
+
+                }
+            } else {
+        #ifdef KERNEL_FIX_STATES
+                dotProductTriple<VectorClass, double, nstates, FMA>(val0, val1, val2, theta, lh_ptn, df_ptn, ddf_ptn, block);
+        #else
+                dotProductTriple<VectorClass, double, FMA>(val0, val1, val2, theta, lh_ptn, df_ptn, ddf_ptn, block, nstates);
+        #endif
+            }
+            lh_ptn = abs(lh_ptn + VectorClass().load_a(&ptn_invar[ptn]));
+            
+            if (ptn < orig_nptn) {
+                lh_ptn = 1.0 / lh_ptn;
+                VectorClass df_frac = df_ptn * lh_ptn;
+                VectorClass ddf_frac = ddf_ptn * lh_ptn;
+                VectorClass freq;
+                freq.load_a(&ptn_freq[ptn]);
+                VectorClass tmp1 = df_frac * freq;
+                VectorClass tmp2 = ddf_frac * freq;
+                my_df += tmp1;
+                my_ddf += nmul_add(tmp1, df_frac, tmp2);
+            } else {
+                // ascertainment bias correction
+                if (ptn+VectorClass::size() > nptn) {
+                    // cutoff the last entries if going beyond
+                    lh_ptn.cutoff(nptn-ptn);
+                    df_ptn.cutoff(nptn-ptn);
+                    ddf_ptn.cutoff(nptn-ptn);
+                }
+                if (horizontal_or(VectorClass().load_a(&buffer_scale_all[ptn]) != 0.0)) {
+                    // some entries are rescaled
+                    double *lh_ptn_dbl = (double*)&lh_ptn;
+                    double *df_ptn_dbl = (double*)&df_ptn;
+                    double *ddf_ptn_dbl = (double*)&ddf_ptn;
+                    for (i = 0; i < VectorClass::size(); i++)
+                        if (buffer_scale_all[ptn+i] != 0.0) {
+                            lh_ptn_dbl[i] *= SCALING_THRESHOLD;
+                            df_ptn_dbl[i] *= SCALING_THRESHOLD;
+                            ddf_ptn_dbl[i] *= SCALING_THRESHOLD;
+                        }
+                }
+
+                vc_prob_const += lh_ptn;
+                vc_df_const += df_ptn;
+                vc_ddf_const += ddf_ptn;
+            }
+        } // FOR ptn
+    #ifdef _OPENMP
+    #pragma omp critical
+    #endif
+        {
+            all_df += my_df;
+            all_ddf += my_ddf;
+            if (isASC) {
+                all_prob_const += vc_prob_const;
+                all_df_const += vc_df_const;
+                all_ddf_const += vc_ddf_const;
+            }
+        }
+    } // FOR thread
+
+    // mark buffer as computed
+    theta_computed = true;
+
+	df = horizontal_add(all_df);
+	ddf = horizontal_add(all_ddf);
+
+    if (!SAFE_NUMERIC && (std::isnan(df) || std::isinf(df)))
+        outError("Numerical underflow (lh-derivative). Run again with the safe likelihood kernel via `-safe` option");
+
+	if (isASC) {
+        double prob_const = 0.0, df_const = 0.0, ddf_const = 0.0;
+        prob_const = horizontal_add(all_prob_const);
+        df_const = horizontal_add(all_df_const);
+        ddf_const = horizontal_add(all_ddf_const);
+    	// ascertainment bias correction
+    	prob_const = 1.0 - prob_const;
+    	double df_frac = df_const / prob_const;
+    	double ddf_frac = ddf_const / prob_const;
+    	int nsites = aln->getNSite();
+    	df += nsites * df_frac;
+    	ddf += nsites *(ddf_frac + df_frac*df_frac);
+    }
+
+    if (std::isnan(df) || std::isinf(df)) {
+        cerr << "WARNING: Numerical underflow for lh-derivative" << endl;
+        df = ddf = 0.0;
+    }
+}
+
+
+
+
+/*******************************************************
+ * Log-likelihood of the tree evaluated across the branch (dad_branch, dad),
+ * vectorized over alignment patterns; fills _pattern_lh and _pattern_lh_cat.
+ * Returns the ptn_freq-weighted sum of per-pattern log-likelihoods (ASC-corrected if isASC).
+ ******************************************************/
+
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA, const bool SITE_MODEL>
+double PhyloTree::computeLikelihoodBranchSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad)
+#else
+template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA, const bool SITE_MODEL>
+double PhyloTree::computeLikelihoodBranchGenericSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad)
+#endif
+{
+    PhyloNode *node = (PhyloNode*) dad_branch->node; // node at the far end of dad_branch
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad); // the same branch as seen from node
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    if (node->isLeaf()) { // swap both ends so that a leaf end, if any, plays the role of dad
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+
+#ifdef KERNEL_FIX_STATES
+    computeTraversalInfo<VectorClass, nstates>(node, dad, false);
+#else
+    computeTraversalInfo<VectorClass>(node, dad, false);
+#endif
+//    if ((dad_branch->partial_lh_computed & 1) == 0)
+//        computePartialLikelihood(dad_branch, dad);
+//    if ((node_branch->partial_lh_computed & 1) == 0)
+//        computePartialLikelihood(node_branch, node);
+//    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+    double tree_lh = 0.0;
+#ifndef KERNEL_FIX_STATES
+    size_t nstates = aln->num_states;
+#endif
+    size_t ncat = site_rate->getNRate();
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures(); // number of (rate category, mixture class) combinations
+
+    size_t block = ncat_mix * nstates; // partial_lh entries per pattern
+    size_t tip_block = nstates * model->getNMixtures();
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i;
+    size_t orig_nptn = aln->size();
+    size_t max_orig_nptn = ((orig_nptn+VectorClass::size()-1)/VectorClass::size())*VectorClass::size(); // orig_nptn rounded up to a multiple of the vector width
+    size_t nptn = max_orig_nptn+model_factory->unobserved_ptns.size(); // plus the unobserved patterns used for ASC
+    size_t tip_mem_size = max_orig_nptn * nstates;
+    bool isASC = model_factory->unobserved_ptns.size() > 0;
+
+    size_t mix_addr_nstates[ncat_mix], mix_addr[ncat_mix]; // filled below only when !SITE_MODEL
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+//    double *val = aligned_alloc<double>(block);
+    double *val = NULL;
+    double *buffer_partial_lh_ptr = buffer_partial_lh; // scratch regions are carved out of buffer_partial_lh in order
+
+    // SITE_MODEL only: branch length times category rate, and category proportions
+    double cat_length[ncat];
+    double cat_prop[ncat];
+    if (SITE_MODEL) {
+        for (c = 0; c < ncat; c++) {
+            cat_length[c] = site_rate->getRate(c) * dad_branch->length;
+            cat_prop[c] = site_rate->getProp(c);
+        }
+    } else {
+        val = buffer_partial_lh_ptr; // val[c*nstates+i] = exp(eval*rate*length) * prop
+        buffer_partial_lh_ptr += get_safe_upper_limit(block);
+        if (nstates % VectorClass::size() == 0) { // vectorized fill of val
+            size_t loop_size = nstates / VectorClass::size();
+            for (c = 0; c < ncat_mix; c++) {
+                size_t mycat = c%ncat;
+                size_t m = c/denom;
+                mix_addr_nstates[c] = m*nstates;
+                mix_addr[c] = mix_addr_nstates[c]*nstates;
+                VectorClass *eval_ptr = (VectorClass*)(eval + mix_addr_nstates[c]);
+                double len = site_rate->getRate(mycat)*dad_branch->length;
+                double prop = site_rate->getProp(mycat) * model->getMixtureWeight(m);
+                VectorClass *this_val = (VectorClass*)(val + c*nstates);
+                for (i = 0; i < loop_size; i++)
+                    this_val[i] = exp(eval_ptr[i]*len) * prop;
+            }
+        } else { // scalar fill of val, same formula
+            for (c = 0; c < ncat_mix; c++) {
+                size_t mycat = c%ncat;
+                size_t m = c/denom;
+                mix_addr_nstates[c] = m*nstates;
+                mix_addr[c] = mix_addr_nstates[c]*nstates;
+                double *eval_ptr = eval + mix_addr_nstates[c];
+                double len = site_rate->getRate(mycat)*dad_branch->length;
+                double prop = site_rate->getProp(mycat) * model->getMixtureWeight(m);
+                double *this_val = val + c*nstates;
+                for (i = 0; i < nstates; i++)
+                    this_val[i] = exp(eval_ptr[i]*len) * prop;
+            }
+        }
+    }
+
+    VectorClass all_tree_lh(0.0);
+    VectorClass all_prob_const(0.0);
+
+    vector<size_t> limits;
+    computeBounds<VectorClass>(num_threads, nptn, limits); // limits[t] .. limits[t+1] is the pattern range handled by thread t
+
+    if (dad->isLeaf()) {
+    	// special treatment for TIP-INTERNAL NODE case
+//    	double *partial_lh_node = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+        double *partial_lh_node;
+        if (SITE_MODEL)
+            partial_lh_node = &tip_partial_lh[dad->id * tip_mem_size];
+        else {
+            partial_lh_node = buffer_partial_lh_ptr;
+            buffer_partial_lh_ptr += get_safe_upper_limit((aln->STATE_UNKNOWN+1)*block);
+        }
+
+        if (!SITE_MODEL) {
+            IntVector states_dad = aln->seq_states[dad->id];
+            states_dad.push_back(aln->STATE_UNKNOWN);
+            // precompute val * tip_partial_lh for every state in seq_states[dad->id] plus STATE_UNKNOWN
+            if (nstates % VectorClass::size() == 0) {
+                // vectorized version
+                for (IntVector::iterator it = states_dad.begin(); it != states_dad.end(); it++) {
+                    double *lh_node = partial_lh_node + (*it)*block;
+                    double *lh_tip = tip_partial_lh + (*it)*tip_block;
+                    double *vc_val_tmp = val;
+                    for (c = 0; c < ncat_mix; c++) {
+                        double *this_lh_tip = lh_tip + mix_addr_nstates[c];
+                        for (i = 0; i < nstates; i+=VectorClass::size()) {
+                            (VectorClass().load_a(&vc_val_tmp[i]) * VectorClass().load_a(&this_lh_tip[i])).store_a(&lh_node[i]);
+                        }
+                        lh_node += nstates;
+                        vc_val_tmp += nstates;
+                    }
+                }
+            } else {
+                // non-vectorized version
+                for (IntVector::iterator it = states_dad.begin(); it != states_dad.end(); it++) {
+                    double *lh_node = partial_lh_node +(*it)*block;
+                    double *val_tmp = val;
+                    double *this_tip_partial_lh = tip_partial_lh + (*it)*tip_block;
+                    for (c = 0; c < ncat_mix; c++) {
+                        double *lh_tip = this_tip_partial_lh + mix_addr_nstates[c];
+                        for (i = 0; i < nstates; i++) {
+                              lh_node[i] = val_tmp[i] * lh_tip[i];
+                        }
+                        lh_node += nstates;
+                        val_tmp += nstates;
+                    }
+                }
+            }
+        }
+
+    	// now do the real computation
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i, c) schedule(static, 1) num_threads(num_threads)
+#endif
+        for (int thread_id = 0; thread_id < num_threads; thread_id++) {
+
+            VectorClass vc_tree_lh(0.0), vc_prob_const(0.0); // partial sums of this thread
+
+            size_t ptn_lower = limits[thread_id];
+            size_t ptn_upper = limits[thread_id+1];
+
+            // reset memory for _pattern_lh_cat
+            memset(_pattern_lh_cat + ptn_lower*ncat_mix, 0, sizeof(double)*(ptn_upper-ptn_lower)*ncat_mix);
+
+            // first compute partial_lh
+            for (vector<TraversalInfo>::iterator it = traversal_info.begin(); it != traversal_info.end(); it++)
+                computePartialLikelihood(*it, ptn_lower, ptn_upper, thread_id);
+
+            double *vec_tip = buffer_partial_lh_ptr + block*VectorClass::size()*thread_id; // scratch of block*VectorClass::size() doubles for this thread
+
+            for (ptn = ptn_lower; ptn < ptn_upper; ptn+=VectorClass::size()) {
+                VectorClass lh_ptn;
+                lh_ptn.load_a(&ptn_invar[ptn]);
+                VectorClass *lh_cat = (VectorClass*)(_pattern_lh_cat + ptn*ncat_mix);
+                VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+                VectorClass *lh_node = SITE_MODEL ? (VectorClass*)&partial_lh_node[ptn*nstates] : (VectorClass*)vec_tip;
+
+                if (SITE_MODEL) {
+                    // site-specific model
+                    VectorClass* eval_ptr = (VectorClass*) &eval[ptn*nstates];
+                    for (c = 0; c < ncat; c++) {
+    #ifdef KERNEL_FIX_STATES
+                        dotProductExp<VectorClass, double, nstates, FMA>(eval_ptr, lh_node, partial_lh_dad, cat_length[c], lh_cat[c]);
+    #else
+                        dotProductExp<VectorClass, double, FMA>(eval_ptr, lh_node, partial_lh_dad, cat_length[c], lh_cat[c], nstates);
+    #endif
+                        if (SAFE_NUMERIC)
+                            lh_cat[c] *= cat_prop[c]; // summed later, after per-category rescaling
+                        else
+                            lh_ptn += (lh_cat[c] *= cat_prop[c]);
+
+                        partial_lh_dad += nstates;
+                    }
+                } else { // normal model
+                    // gather the tip vectors of VectorClass::size() consecutive patterns, interleaved by lane
+                    for (i = 0; i < VectorClass::size(); i++) {
+                        double *lh_tip;
+                        if (ptn+i < orig_nptn) // real pattern
+                            lh_tip = partial_lh_node + block*(aln->at(ptn+i))[dad->id];
+                        else if (ptn+i < max_orig_nptn) // padding after the real patterns
+                            lh_tip = partial_lh_node + block*aln->STATE_UNKNOWN;
+                        else if (ptn+i < nptn) // unobserved (ASC) pattern
+                            lh_tip = partial_lh_node + block*model_factory->unobserved_ptns[ptn+i-max_orig_nptn];
+                        else // padding beyond the last pattern
+                            lh_tip = partial_lh_node + block*aln->STATE_UNKNOWN;
+
+                        double *this_vec_tip = vec_tip+i;
+                        for (c = 0; c < block; c++) {
+                            *this_vec_tip = lh_tip[c];
+                            this_vec_tip += VectorClass::size();
+                        }
+
+                    }
+                    // compute likelihood per category
+                    for (c = 0; c < ncat_mix; c++) {
+    #ifdef KERNEL_FIX_STATES
+                        dotProductVec<VectorClass, VectorClass, nstates, FMA>(lh_node, partial_lh_dad, lh_cat[c]);
+    #else
+                        dotProductVec<VectorClass, VectorClass, FMA>(lh_node, partial_lh_dad, lh_cat[c], nstates);
+    #endif
+                        if (!SAFE_NUMERIC)
+                            lh_ptn += lh_cat[c];
+                        lh_node += nstates;
+                        partial_lh_dad += nstates;
+                    }
+                } // if SITE_MODEL
+
+                // compute scaling factor per pattern
+                VectorClass vc_min_scale(0.0);
+                double* vc_min_scale_ptr = (double*)&vc_min_scale; // lane-wise access to vc_min_scale
+                if (SAFE_NUMERIC) {
+                    // numerical scaling per category
+                    UBYTE *scale_dad = dad_branch->scale_num + ptn*ncat_mix;
+                    UBYTE min_scale;
+                    for (i = 0; i < VectorClass::size(); i++) {
+    //                    scale_dad = dad_branch->scale_num+(ptn+i)*ncat_mix;
+                        min_scale = scale_dad[0];
+                        for (c = 1; c < ncat_mix; c++)
+                            min_scale = min(min_scale, scale_dad[c]);
+
+                        vc_min_scale_ptr[i] = min_scale;
+
+                        double *this_lh_cat = &_pattern_lh_cat[ptn*ncat_mix + i]; // lane i of the interleaved lh_cat vectors
+                        for (c = 0; c < ncat_mix; c++) {
+                            // rescale lh_cat if necessary
+                            if (scale_dad[c] == min_scale+1) {
+                                this_lh_cat[c*VectorClass::size()] *= SCALING_THRESHOLD;
+                            } else if (scale_dad[c] > min_scale+1) {
+                                this_lh_cat[c*VectorClass::size()] = 0.0;
+                            }
+                        }
+                        scale_dad += ncat_mix;
+                    }
+                    // now take the sum of (rescaled) lh_cat
+                    sumVec<VectorClass, true>(lh_cat, lh_ptn, ncat_mix);
+
+                } else {
+                    for (i = 0; i < VectorClass::size(); i++) {
+                        vc_min_scale_ptr[i] = dad_branch->scale_num[ptn+i];
+                    }
+                }
+                vc_min_scale *= LOG_SCALING_THRESHOLD;
+
+                lh_ptn = abs(lh_ptn);
+                if (ptn < orig_nptn) {
+                    lh_ptn = log(lh_ptn) + vc_min_scale;
+                    lh_ptn.store_a(&_pattern_lh[ptn]);
+                    vc_tree_lh = mul_add(lh_ptn, VectorClass().load_a(&ptn_freq[ptn]), vc_tree_lh);
+                } else {
+                    // ascertainment bias correction
+                    if (ptn+VectorClass::size() > nptn) {
+                        // cutoff the last entries if going beyond
+                        lh_ptn.cutoff(nptn-ptn);
+                    }
+                    // bugfix 2016-01-21, prob_const can be rescaled
+                    if (horizontal_or(vc_min_scale != 0.0)) {
+                        // some entries are rescaled
+                        double *lh_ptn_dbl = (double*)&lh_ptn;
+                        for (i = 0; i < VectorClass::size(); i++)
+                            if (vc_min_scale_ptr[i] != 0.0)
+                                lh_ptn_dbl[i] *= SCALING_THRESHOLD;
+                    }
+                    vc_prob_const += lh_ptn;
+                }
+            } // FOR PTN
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+            {
+                all_tree_lh += vc_tree_lh;
+                if (isASC)
+                    all_prob_const += vc_prob_const;
+            }
+        } // FOR thread
+
+    } else {
+
+//        assert(0 && "Don't compute tree log-likelihood from internal branch!");
+    	//-------- both dad and node are internal nodes -----------/
+
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i, c) schedule(static, 1) num_threads(num_threads)
+#endif
+        for (int thread_id = 0; thread_id < num_threads; thread_id++) {
+
+            size_t ptn_lower = limits[thread_id];
+            size_t ptn_upper = limits[thread_id+1];
+
+            VectorClass vc_tree_lh(0.0), vc_prob_const(0.0); // partial sums of this thread
+
+            // reset memory for _pattern_lh_cat
+            memset(_pattern_lh_cat + ptn_lower*ncat_mix, 0, sizeof(double)*(ptn_upper-ptn_lower)*ncat_mix);
+
+            // first compute partial_lh
+            for (vector<TraversalInfo>::iterator it = traversal_info.begin(); it != traversal_info.end(); it++)
+                computePartialLikelihood(*it, ptn_lower, ptn_upper, thread_id);
+
+            for (ptn = ptn_lower; ptn < ptn_upper; ptn+=VectorClass::size()) {
+                VectorClass lh_ptn;
+                lh_ptn.load_a(&ptn_invar[ptn]);
+                VectorClass *lh_cat = (VectorClass*)(_pattern_lh_cat + ptn*ncat_mix);
+                VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
+                VectorClass *partial_lh_node = (VectorClass*)(node_branch->partial_lh + ptn*block);
+
+                // compute likelihood per category
+                if (SITE_MODEL) {
+                    VectorClass* eval_ptr = (VectorClass*) &eval[ptn*nstates];
+                    for (c = 0; c < ncat; c++) {
+    #ifdef KERNEL_FIX_STATES
+                        dotProductExp<VectorClass, double, nstates, FMA>(eval_ptr, partial_lh_node, partial_lh_dad, cat_length[c], lh_cat[c]);
+    #else
+                        dotProductExp<VectorClass, double, FMA>(eval_ptr, partial_lh_node, partial_lh_dad, cat_length[c], lh_cat[c], nstates);
+    #endif
+                        if (SAFE_NUMERIC)
+                            lh_cat[c] *= cat_prop[c]; // summed later, after per-category rescaling
+                        else
+                            lh_ptn += (lh_cat[c] *= cat_prop[c]);
+                        partial_lh_node += nstates;
+                        partial_lh_dad += nstates;
+                    }
+                } else {
+                    double *val_tmp = val;
+                    for (c = 0; c < ncat_mix; c++) {
+    #ifdef KERNEL_FIX_STATES
+                        dotProduct3Vec<VectorClass, double, nstates, FMA>(val_tmp, partial_lh_node, partial_lh_dad, lh_cat[c]);
+    #else
+                        dotProduct3Vec<VectorClass, double, FMA>(val_tmp, partial_lh_node, partial_lh_dad, lh_cat[c], nstates);
+    #endif
+                        if (!SAFE_NUMERIC)
+                            lh_ptn += lh_cat[c];
+                        partial_lh_node += nstates;
+                        partial_lh_dad += nstates;
+                        val_tmp += nstates;
+                    }
+                } // if SITE MODEL
+
+
+                // compute the scaling factor per pattern
+                VectorClass vc_min_scale(0.0);
+                double* vc_min_scale_ptr = (double*)&vc_min_scale; // lane-wise access to vc_min_scale
+                if (SAFE_NUMERIC) {
+                    UBYTE *scale_dad = dad_branch->scale_num + ptn*ncat_mix;
+                    UBYTE *scale_node = node_branch->scale_num + ptn*ncat_mix;
+                    UBYTE sum_scale[ncat_mix]; // combined scaling count of both sides, per category
+                    UBYTE min_scale;
+
+                    for (i = 0; i < VectorClass::size(); i++) {
+                        min_scale = sum_scale[0] = scale_dad[0] + scale_node[0];
+                        for (c = 1; c < ncat_mix; c++) {
+                            sum_scale[c] = scale_dad[c] + scale_node[c];
+                            min_scale = min(min_scale, sum_scale[c]);
+                        }
+                        vc_min_scale_ptr[i] = min_scale;
+                        double *this_lh_cat = &_pattern_lh_cat[ptn*ncat_mix + i]; // lane i of the interleaved lh_cat vectors
+                        for (c = 0; c < ncat_mix; c++) {
+                            if (sum_scale[c] == min_scale+1) {
+                                this_lh_cat[c*VectorClass::size()] *= SCALING_THRESHOLD;
+                            } else if (sum_scale[c] > min_scale+1) {
+                                // reset if category is scaled a lot
+                                this_lh_cat[c*VectorClass::size()] = 0.0;
+                            }
+                        }
+                        scale_dad += ncat_mix;
+                        scale_node += ncat_mix;
+                    }
+                    sumVec<VectorClass, true>(lh_cat, lh_ptn, ncat_mix);
+                } else {
+                    for (i = 0; i < VectorClass::size(); i++) {
+                        vc_min_scale_ptr[i] = dad_branch->scale_num[ptn+i] + node_branch->scale_num[ptn+i];
+                    }
+                } // if SAFE_NUMERIC
+                vc_min_scale *= LOG_SCALING_THRESHOLD;
+
+                lh_ptn = abs(lh_ptn);
+
+                if (ptn < orig_nptn) {
+                    lh_ptn = log(lh_ptn) + vc_min_scale;
+                    lh_ptn.store_a(&_pattern_lh[ptn]);
+                    vc_tree_lh = mul_add(lh_ptn, VectorClass().load_a(&ptn_freq[ptn]), vc_tree_lh);
+                } else {
+                    // ascertainment bias correction
+                    if (ptn+VectorClass::size() > nptn) {
+                        // cutoff the last entries if going beyond
+                        lh_ptn.cutoff(nptn-ptn);
+                    }
+                    // bugfix 2016-01-21, prob_const can be rescaled
+                    if (horizontal_or(vc_min_scale != 0.0)) {
+                        // some entries are rescaled
+                        double *lh_ptn_dbl = (double*)&lh_ptn;
+                        for (i = 0; i < VectorClass::size(); i++)
+                            if (vc_min_scale_ptr[i] != 0.0)
+                                lh_ptn_dbl[i] *= SCALING_THRESHOLD;
+                    }
+                    vc_prob_const += lh_ptn;
+                }
+            } // FOR LOOP ptn
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+            {
+                all_tree_lh += vc_tree_lh;
+                if (isASC)
+                    all_prob_const += vc_prob_const;
+            }
+        } // FOR thread
+    } // else
+
+    tree_lh += horizontal_add(all_tree_lh); // sum over the vector lanes
+
+    if (!SAFE_NUMERIC && (std::isnan(tree_lh) || std::isinf(tree_lh)))
+        outError("Numerical underflow (lh-branch). Run again with the safe likelihood kernel via `-safe` option");
+
+    assert(!std::isnan(tree_lh) && !std::isinf(tree_lh) && "Numerical underflow for lh-branch");
+
+    if (isASC) {
+    	// ascertainment bias correction
+        double prob_const = horizontal_add(all_prob_const);
+        if (prob_const >= 1.0 || prob_const < 0.0) { // dump tree and model before the assertion below fires
+            printTree(cout, WT_TAXON_ID + WT_BR_LEN + WT_NEWLINE);
+            model->writeInfo(cout);
+        }
+        assert(prob_const < 1.0 && prob_const >= 0.0);
+
+        // BQM 2015-10-11: fix this those functions using _pattern_lh_cat
+//        double inv_const = 1.0 / (1.0-prob_const);
+//        size_t nptn_cat = orig_nptn*ncat;
+//    	for (ptn = 0; ptn < nptn_cat; ptn++)
+//            _pattern_lh_cat[ptn] *= inv_const;
+        
+    	prob_const = log(1.0 - prob_const); // from here on prob_const holds log(1 - prob_const)
+    	for (ptn = 0; ptn < orig_nptn; ptn+=VectorClass::size())
+            (VectorClass().load_a(&_pattern_lh[ptn])-prob_const).store_a(&_pattern_lh[ptn]);
+//    		_pattern_lh[ptn] -= prob_const;
+    	tree_lh -= aln->getNSite()*prob_const;
+		assert(!std::isnan(tree_lh) && !std::isinf(tree_lh));
+    }
+
+    return tree_lh;
+}
+
+
+/*******************************************************
+ * Log-likelihood recomputed from the cached theta_all buffer using the branch
+ * length of current_it; asserts that theta_all is set and theta_computed is true.
+ * Fills _pattern_lh and returns the ptn_freq-weighted sum (ASC-corrected if isASC).
+ ******************************************************/
+
+#ifdef KERNEL_FIX_STATES
+template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA, const bool SITE_MODEL>
+double PhyloTree::computeLikelihoodFromBufferSIMD()
+#else
+template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA, const bool SITE_MODEL>
+double PhyloTree::computeLikelihoodFromBufferGenericSIMD()
+#endif
+{
+
+	assert(theta_all && theta_computed);
+
+//	double tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
+
+#ifndef KERNEL_FIX_STATES
+    size_t nstates = aln->num_states;
+#endif
+    size_t ncat = site_rate->getNRate();
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures(); // number of (rate category, mixture class) combinations
+
+    size_t block = ncat_mix * nstates; // theta_all entries per pattern
+//    size_t tip_block = nstates * model->getNMixtures();
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i;
+    size_t orig_nptn = aln->size();
+    size_t max_orig_nptn = ((orig_nptn+VectorClass::size()-1)/VectorClass::size())*VectorClass::size(); // orig_nptn rounded up to a multiple of the vector width
+    size_t nptn = max_orig_nptn+model_factory->unobserved_ptns.size(); // plus the unobserved patterns used for ASC
+    bool isASC = model_factory->unobserved_ptns.size() > 0;
+
+    size_t mix_addr_nstates[ncat_mix], mix_addr[ncat_mix];
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+    for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom;
+        mix_addr_nstates[c] = m*nstates; // offset of mixture class m in eval
+        mix_addr[c] = mix_addr_nstates[c]*nstates;
+    }
+
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+    double *val0 = NULL;
+    double cat_length[ncat];
+    double cat_prop[ncat];
+
+    if (SITE_MODEL) {
+        for (c = 0; c < ncat; c++) {
+            cat_length[c] = site_rate->getRate(c) * current_it->length;
+            cat_prop[c] = site_rate->getProp(c);
+        }
+    } else {
+        val0 = buffer_partial_lh; // val0[c*nstates+i] = exp(eval*rate*length) * prop
+        if (nstates % VectorClass::size() == 0) { // vectorized fill of val0
+            VectorClass *vc_val0 = (VectorClass*)val0;
+            size_t loop_size = nstates / VectorClass::size();
+            for (c = 0; c < ncat_mix; c++) {
+                size_t m = c/denom;
+                VectorClass *eval_ptr = (VectorClass*)(eval + mix_addr_nstates[c]);
+                size_t mycat = c%ncat;
+                double prop = site_rate->getProp(mycat) * model->getMixtureWeight(m);
+                double len = site_rate->getRate(mycat) * current_it->length;
+                for (i = 0; i < loop_size; i++) {
+                    vc_val0[i] = exp(eval_ptr[i] * len) * prop;
+                }
+                vc_val0 += loop_size;
+            }
+        } else { // scalar fill of val0
+            for (c = 0; c < ncat_mix; c++) {
+                size_t m = c/denom;
+                double *eval_ptr = eval + mix_addr_nstates[c];
+                size_t mycat = c%ncat;
+                double prop = site_rate->getProp(mycat) * model->getMixtureWeight(m);
+                size_t addr = c*nstates;
+                for (i = 0; i < nstates; i++) {
+                    double cof = eval_ptr[i]*site_rate->getRate(mycat);
+                    double val = exp(cof*current_it->length) * prop;
+                    val0[addr+i] = val;
+                }
+            }
+        }
+    }
+
+//    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+
+    VectorClass all_tree_lh(0.0), all_prob_const(0.0);
+
+#ifdef _OPENMP
+#pragma omp parallel private(ptn, i, c) num_threads(num_threads)
+    {
+#endif
+        VectorClass vc_tree_lh(0.0), vc_prob_const(0.0); // partial sums; declared inside the parallel region under OpenMP
+#ifdef _OPENMP
+#pragma omp for schedule(static) nowait
+#endif
+    for (ptn = 0; ptn < nptn; ptn+=VectorClass::size()) {
+		VectorClass lh_ptn;
+		VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
+        if (SITE_MODEL) {
+            VectorClass *eval_ptr = (VectorClass*)&eval[ptn*nstates];
+            lh_ptn.load_a(&ptn_invar[ptn]);
+            for (c = 0; c < ncat; c++) {
+                VectorClass lh_cat;
+#ifdef KERNEL_FIX_STATES
+                dotProductExp<VectorClass, double, nstates, FMA>(eval_ptr, theta, cat_length[c], lh_cat);
+#else
+                dotProductExp<VectorClass, double, FMA>(eval_ptr, theta, cat_length[c], lh_cat, nstates);
+#endif
+                lh_ptn = mul_add(lh_cat, cat_prop[c], lh_ptn);
+                theta += nstates;
+            }
+        } else {
+            dotProductVec<VectorClass, double, FMA>(val0, theta, lh_ptn, block);
+            lh_ptn += VectorClass().load_a(&ptn_invar[ptn]);
+        }
+
+        if (ptn < orig_nptn) {
+            lh_ptn = log(abs(lh_ptn)) + VectorClass().load_a(&buffer_scale_all[ptn]);
+            lh_ptn.store_a(&_pattern_lh[ptn]);
+            vc_tree_lh = mul_add(lh_ptn, VectorClass().load_a(&ptn_freq[ptn]), vc_tree_lh);
+        } else {
+            // bugfix 2016-01-21, prob_const can be rescaled
+//                if (min_scale >= 1)
+//                    lh_ptn *= SCALING_THRESHOLD;
+//				_pattern_lh[ptn] = lh_ptn;
+			// ascertainment bias correction
+            if (ptn+VectorClass::size() > nptn) {
+                // cutoff the last entries if going beyond
+                lh_ptn.cutoff(nptn-ptn);
+            }
+            if (horizontal_or(VectorClass().load_a(&buffer_scale_all[ptn]) != 0.0)) {
+                // some entries are rescaled
+                double *lh_ptn_dbl = (double*)&lh_ptn;
+                for (i = 0; i < VectorClass::size(); i++)
+                    if (buffer_scale_all[ptn+i] != 0.0)
+                        lh_ptn_dbl[i] *= SCALING_THRESHOLD;
+            }
+            vc_prob_const += lh_ptn;
+        }
+    }
+#ifdef _OPENMP
+#pragma omp critical
+        {
+            all_tree_lh += vc_tree_lh;
+            if (isASC)
+                all_prob_const += vc_prob_const;
+        }
+    }
+#else
+    all_tree_lh = vc_tree_lh; // without OpenMP the partial sums are already the totals
+    all_prob_const = vc_prob_const;
+#endif
+
+    double tree_lh = horizontal_add(all_tree_lh); // sum over the vector lanes
+
+    if (!SAFE_NUMERIC && (std::isnan(tree_lh) || std::isinf(tree_lh)))
+        outError("Numerical underflow (lh-from-buffer). Run again with the safe likelihood kernel via `-safe` option");
+
+    assert(!std::isnan(tree_lh) && !std::isinf(tree_lh) && "Numerical underflow for lh-from-buffer");
+
+    if (isASC) {
+    	// ascertainment bias correction
+        double prob_const = horizontal_add(all_prob_const);
+        if (prob_const >= 1.0 || prob_const < 0.0) { // dump tree and model before the assertion below fires
+            printTree(cout, WT_TAXON_ID + WT_BR_LEN + WT_NEWLINE);
+            model->writeInfo(cout);
+        }
+        assert(prob_const < 1.0 && prob_const >= 0.0);
+
+        // BQM 2015-10-11: fix this those functions using _pattern_lh_cat
+//        double inv_const = 1.0 / (1.0-prob_const);
+//        size_t nptn_cat = orig_nptn*ncat;
+//    	for (ptn = 0; ptn < nptn_cat; ptn++)
+//            _pattern_lh_cat[ptn] *= inv_const;
+        
+    	prob_const = log(1.0 - prob_const); // from here on prob_const holds log(1 - prob_const)
+    	for (ptn = 0; ptn < orig_nptn; ptn+=VectorClass::size())
+            (VectorClass().load_a(&_pattern_lh[ptn])-prob_const).store_a(&_pattern_lh[ptn]);
+//    		_pattern_lh[ptn] -= prob_const;
+    	tree_lh -= aln->getNSite()*prob_const;
+		assert(!std::isnan(tree_lh) && !std::isinf(tree_lh));
+    }
+
+    return tree_lh;
+}
+
+
+#endif //PHYLOKERNELNEW_H_
diff --git a/phylokernel.h b/phylokernelsafe.h
similarity index 52%
copy from phylokernel.h
copy to phylokernelsafe.h
index e0b9f7c..b378fbb 100644
--- a/phylokernel.h
+++ b/phylokernelsafe.h
@@ -1,17 +1,21 @@
 /*
- * phylokernel.h
+ * phylokernelsafe.h
+ * Safe likelihood kernel that scales likelihood per category
  *
- *  Created on: Dec 14, 2014
+ *  Created on: Sept 23, 2016
  *      Author: minh
  */
 
-#ifndef PHYLOKERNEL_H_
-#define PHYLOKERNEL_H_
+#ifndef PHYLOKERNELSAFE_H_
+#define PHYLOKERNELSAFE_H_
 
 #include "phylotree.h"
-#include "vectorclass/vectorclass.h"
-#include "vectorclass/vectormath_exp.h"
+//#include "vectorclass/vectorclass.h"
+//#include "vectorclass/vectormath_exp.h"
+#include "superalignment.h"
 
+
+#ifdef __SSE__
 inline Vec2d horizontal_add(Vec2d x[2]) {
 #if  INSTRSET >= 3  // SSE3
     return _mm_hadd_pd(x[0],x[1]);
@@ -20,7 +24,7 @@ inline Vec2d horizontal_add(Vec2d x[2]) {
     Vec2d help1 = _mm_shuffle_pd(x[0], x[1], _MM_SHUFFLE2(1,1));
     return _mm_add_pd(help0, help1);
 #else
-#error "You must compile with SSE3 enabled!"
+#error "You must compile with SSE2 enabled!"
 #endif
 }
 
@@ -30,6 +34,8 @@ inline double horizontal_max(Vec2d const &a) {
     return max(x[0],x[1]);
 }
 
+#endif
+
 #ifdef __AVX__
 
 inline Vec4d horizontal_add(Vec4d x[4]) {
@@ -74,12 +80,6 @@ Numeric PhyloTree::dotProductSIMD(Numeric *x, Numeric *y, int size) {
 template <class VectorClass, const int VCSIZE, const int nstates>
 void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
 
-    if (dad_branch->node->degree() > 3) {
-        // TODO: SIMD version for multifurcating node
-        computePartialLikelihoodEigen(dad_branch, dad);
-        return;
-    }
-
     // don't recompute the likelihood
 	assert(dad);
     if (dad_branch->partial_lh_computed & 1)
@@ -101,33 +101,39 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 	}
 
     size_t ptn, c;
-    size_t orig_ntn = aln->size();
+    size_t orig_nptn = aln->size();
 
     size_t ncat = site_rate->getNRate();
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
     assert(nstates == aln->num_states && nstates >= VCSIZE && VCSIZE == VectorClass().size());
     assert(model->isReversible()); // only works with reversible model!
     const size_t nstatesqr=nstates*nstates;
     size_t i, x, j;
-    size_t block = nstates * ncat;
+    size_t block = nstates * ncat_mix;
+    size_t tip_block = nstates * model->getNMixtures();
+    size_t scale_size = nptn * ncat_mix;
+
+    size_t mix_addr_nstates[ncat_mix], mix_addr[ncat_mix];
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+    for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom;
+        mix_addr_nstates[c] = m*nstates;
+        mix_addr[c] = m*nstatesqr;
+    }
 
 	// internal node
-	assert(node->degree() == 3); // it works only for strictly bifurcating tree
+    dad_branch->lh_scale_factor = 0.0;
 	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+    int num_leaves = 0;
 	FOR_NEIGHBOR_IT(node, dad, it) {
+        PhyloNeighbor *nei = (PhyloNeighbor*)*it;
 		if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+        if ((nei->partial_lh_computed & 1) == 0)
+            computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(nei, node);
+        dad_branch->lh_scale_factor += nei->lh_scale_factor;
+        if ((*it)->node->isLeaf()) num_leaves++;
 	}
 
-	if (!left->node->isLeaf() && right->node->isLeaf()) {
-		// swap left and right
-		PhyloNeighbor *tmp = left;
-		left = right;
-		right = tmp;
-	}
-	if ((left->partial_lh_computed & 1) == 0)
-		computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(left, node);
-	if ((right->partial_lh_computed & 1) == 0)
-		computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(right, node);
-
     if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
         // re-orient partial_lh
         bool done = false;
@@ -149,93 +155,181 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 	double *evec = model->getEigenvectors();
 	double *inv_evec = model->getInverseEigenvectors();
 
-	VectorClass vc_inv_evec[nstates*nstates/VCSIZE];
 	assert(inv_evec && evec);
-	for (i = 0; i < nstates; i++) {
-		for (x = 0; x < nstates/VCSIZE; x++)
-			// inv_evec is not aligned!
-			vc_inv_evec[i*nstates/VCSIZE+x].load_a(&inv_evec[i*nstates+x*VCSIZE]);
-	}
+//	for (i = 0; i < tip_block; i++) {
+//		for (x = 0; x < nstates/VCSIZE; x++)
+//			// inv_evec is not aligned!
+//			vc_inv_evec[i*nstates/VCSIZE+x].load_a(&inv_evec[i*nstates+x*VCSIZE]);
+//	}
 	double *eval = model->getEigenvalues();
 
-	dad_branch->lh_scale_factor = left->lh_scale_factor + right->lh_scale_factor;
 
-	VectorClass *eleft = (VectorClass*)aligned_alloc<double>(block*nstates);
-	VectorClass *eright = (VectorClass*)aligned_alloc<double>(block*nstates);
+    VectorClass *echildren = aligned_alloc<VectorClass>(block*nstates/VCSIZE*(node->degree()-1));
+    double *partial_lh_leaves = NULL;
+    if (num_leaves > 0)
+        partial_lh_leaves = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block*num_leaves);
+    VectorClass *echild = echildren;
+    double *partial_lh_leaf = partial_lh_leaves;
+    
+    
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        VectorClass expchild[nstates/VCSIZE];
+        PhyloNeighbor *child = (PhyloNeighbor*)*it;
+        VectorClass *echild_ptr = echild;
+        // precompute information buffer
+        for (c = 0; c < ncat_mix; c++) {
+            VectorClass len_child = site_rate->getRate(c%ncat) * child->length;
+            double *eval_ptr = eval + mix_addr_nstates[c];
+            double *evec_ptr = evec + mix_addr[c];
+            for (i = 0; i < nstates/VCSIZE; i++) {
+                // eval is not aligned!
+                expchild[i] = exp(VectorClass().load_a(&eval_ptr[i*VCSIZE]) * len_child);
+            }
+            for (x = 0; x < nstates; x++) {
+                for (i = 0; i < nstates/VCSIZE; i++) {
+                    // evec is not aligned!
+                    echild_ptr[i] = (VectorClass().load_a(&evec_ptr[x*nstates+i*VCSIZE]) * expchild[i]);
+                }
+                echild_ptr += nstates/VCSIZE;
+            }
+        }
 
-	// precompute information buffer
-	for (c = 0; c < ncat; c++) {
-		VectorClass vc_evec;
-		VectorClass expleft[nstates/VCSIZE];
-		VectorClass expright[nstates/VCSIZE];
-		double len_left = site_rate->getRate(c) * left->length;
-		double len_right = site_rate->getRate(c) * right->length;
-		for (i = 0; i < nstates/VCSIZE; i++) {
-			// eval is not aligned!
-			expleft[i] = exp(VectorClass().load_a(&eval[i*VCSIZE]) * VectorClass(len_left));
-			expright[i] = exp(VectorClass().load_a(&eval[i*VCSIZE]) * VectorClass(len_right));
-		}
-		for (x = 0; x < nstates; x++)
-			for (i = 0; i < nstates/VCSIZE; i++) {
-				// evec is not be aligned!
-				vc_evec.load_a(&evec[x*nstates+i*VCSIZE]);
-				eleft[c*nstatesqr/VCSIZE+x*nstates/VCSIZE+i] = (vc_evec * expleft[i]);
-				eright[c*nstatesqr/VCSIZE+x*nstates/VCSIZE+i] = (vc_evec * expright[i]);
-			}
+        // pre compute information for tip
+        if (child->node->isLeaf()) {
+            vector<int>::iterator it;
+            for (it = aln->seq_states[child->node->id].begin(); it != aln->seq_states[child->node->id].end(); it++) {
+                int state = (*it);
+                double *this_partial_lh_leaf = partial_lh_leaf + state*block;
+                VectorClass *echild_ptr = echild;
+                for (c = 0; c < ncat_mix; c++) {
+                    VectorClass *this_tip_partial_lh = (VectorClass*)(tip_partial_lh + state*tip_block + mix_addr_nstates[c]);
+                    for (x = 0; x < nstates; x++) {
+                        VectorClass vchild = 0.0;
+                        for (i = 0; i < nstates/VCSIZE; i++) {
+                            vchild += echild_ptr[i] * this_tip_partial_lh[i];
+                        }
+                        this_partial_lh_leaf[x] = horizontal_add(vchild);
+                        echild_ptr += nstates/VCSIZE;
+                    }
+                    this_partial_lh_leaf += nstates;
+                }
+            }
+            size_t addr = aln->STATE_UNKNOWN * block;
+            for (x = 0; x < block; x++) {
+                partial_lh_leaf[addr+x] = 1.0;
+            }
+            partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
+        }
+        echild += block*nstates/VCSIZE;
+    }
+    
+    VectorClass *eleft = echildren, *eright = echildren + block*nstates/VCSIZE;
+    
+	if (!left->node->isLeaf() && right->node->isLeaf()) {
+		PhyloNeighbor *tmp = left;
+		left = right;
+		right = tmp;
+        VectorClass *etmp = eleft;
+        eleft = eright;
+        eright = etmp;
 	}
+    
+    
+    if (node->degree() > 3) {
 
-	if (left->node->isLeaf() && right->node->isLeaf()) {
-		// special treatment for TIP-TIP (cherry) case
-
-		// pre compute information for both tips
-		double *partial_lh_left = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
-		double *partial_lh_right = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
-
-		vector<int>::iterator it;
-		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
-			int state = (*it);
-			VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
-			VectorClass vleft[VCSIZE];
-			size_t addr = state*nstates;
-			for (i = 0; i < nstates/VCSIZE; i++)
-				vc_partial_lh_tmp[i].load_a(&tip_partial_lh[addr+i*VCSIZE]);
-			for (x = 0; x < block; x+=VCSIZE) {
-				addr = x*nstates/VCSIZE;
-				for (j = 0; j < VCSIZE; j++)
-					vleft[j] = eleft[addr+j*nstates/VCSIZE] * vc_partial_lh_tmp[0];
-				for (i = 1; i < nstates/VCSIZE; i++) {
-					for (j = 0; j < VCSIZE; j++)
-						vleft[j] = mul_add(eleft[addr+j*nstates/VCSIZE+i], vc_partial_lh_tmp[i], vleft[j]);
-				}
-				horizontal_add(vleft).store_a(&partial_lh_left[state*block+x]);
-			}
-		}
+        /*--------------------- multifurcating node ------------------*/
+        // now for-loop computing partial_lh over all site-patterns
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, c, x, i) schedule(static)
+#endif
+        for (ptn = 0; ptn < nptn; ptn++) {
+            double partial_lh_all[block];
+            for (i = 0; i < block; i++)
+                partial_lh_all[i] = 1.0;
+            UBYTE *scale_dad = dad_branch->scale_num + ptn*ncat_mix;
+            memset(scale_dad, 0, sizeof(UBYTE)*ncat_mix);
+
+            double *partial_lh_leaf = partial_lh_leaves;
+            double *echild = (double*)echildren;
+
+            FOR_NEIGHBOR_IT(node, dad, it) {
+                PhyloNeighbor *child = (PhyloNeighbor*)*it;
+                UBYTE *scale_child = child->scale_num + ptn*ncat_mix;
+                if (child->node->isLeaf()) {
+                    // external node
+                    int state_child = (ptn < orig_nptn) ? (aln->at(ptn))[child->node->id] : model_factory->unobserved_ptns[ptn-orig_nptn];
+                    double *child_lh = partial_lh_leaf + state_child*block;
+                    for (c = 0; c < block; c++) {
+                        // compute real partial likelihood vector
+                        partial_lh_all[c] *= child_lh[c];
+                    }
+                    partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
+                } else {
+                    // internal node
+                    double *partial_lh = partial_lh_all;
+                    double *partial_lh_child = child->partial_lh + ptn*block;
+
+                    double *echild_ptr = echild;
+                    for (c = 0; c < ncat_mix; c++) {
+                        scale_dad[c] += scale_child[c];
+                        // compute real partial likelihood vector
+                        for (x = 0; x < nstates; x++) {
+                            double vchild = 0.0;
+//                            double *echild_ptr = echild + (c*nstatesqr+x*nstates);
+                            for (i = 0; i < nstates; i++) {
+                                vchild += echild_ptr[i] * partial_lh_child[i];
+                            }
+                            echild_ptr += nstates;
+                            partial_lh[x] *= vchild;
+                        }
+                        partial_lh += nstates;
+                        partial_lh_child += nstates;
+                    }
+                } // if
+                echild += block*nstates;
+            } // FOR_NEIGHBOR
+            
+        
+            // compute dot-product with inv_eigenvector
+            double *partial_lh_tmp = partial_lh_all;
+            double *partial_lh = dad_branch->partial_lh + ptn*block;
+            for (c = 0; c < ncat_mix; c++) {
+                double lh_max = 0.0;
+                double *inv_evec_ptr = inv_evec + mix_addr[c];
+                for (i = 0; i < nstates; i++) {
+                    double res = 0.0;
+                    for (x = 0; x < nstates; x++) {
+                        res += partial_lh_tmp[x]*inv_evec_ptr[x];
+                    }
+                    inv_evec_ptr += nstates;
+                    partial_lh[i] = res;
+                    lh_max = max(lh_max, fabs(res));
+                }
+                // check if one should scale partial likelihoods
+                if (lh_max < SCALING_THRESHOLD && lh_max != 0.0) {
+                    if (ptn_invar[ptn] == 0.0) {
+                        // now do the likelihood scaling
+                        for (i = 0; i < nstates; i++)
+                            partial_lh[i] *= SCALING_THRESHOLD_INVER;
+                        scale_dad[c] += 1;
+                    }
+                }
+                partial_lh += nstates;
+                partial_lh_tmp += nstates;
+            }
 
-		for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
-			int state = (*it);
-			VectorClass vright[VCSIZE];
-			VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+        } // for ptn
 
-			for (i = 0; i < nstates/VCSIZE; i++)
-				vc_partial_lh_tmp[i].load_a(&tip_partial_lh[state*nstates+i*VCSIZE]);
-			for (x = 0; x < block; x+=VCSIZE) {
-				for (j = 0; j < VCSIZE; j++)
-					vright[j] = eright[(x+j)*nstates/VCSIZE] * vc_partial_lh_tmp[0];
-				for (i = 1; i < nstates/VCSIZE; i++) {
-					for (j = 0; j < VCSIZE; j++)
-						vright[j] = mul_add(eright[(x+j)*nstates/VCSIZE+i], vc_partial_lh_tmp[i], vright[j]);
-				}
-				horizontal_add(vright).store_a(&partial_lh_right[state*block+x]);
-			}
-		}
+        // end multifurcating treatment
+    } else if (left->node->isLeaf() && right->node->isLeaf()) {
+		// special treatment for TIP-TIP (cherry) case
 
-		size_t addr_unknown = aln->STATE_UNKNOWN * block;
-		for (x = 0; x < block; x++) {
-			partial_lh_left[addr_unknown+x] = 1.0;
-			partial_lh_right[addr_unknown+x] = 1.0;
-		}
+		// pre compute information for both tips
+		double *partial_lh_left = partial_lh_leaves;
+		double *partial_lh_right = partial_lh_leaves + (aln->STATE_UNKNOWN+1)*block;
 
 		// assign pointers for left and right partial_lh
+        /*
 		double **lh_left_ptr = aligned_alloc<double*>(nptn);
 		double **lh_right_ptr = aligned_alloc<double*>(nptn);
 		for (ptn = 0; ptn < orig_ntn; ptn++) {
@@ -246,9 +340,10 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 			lh_left_ptr[ptn] = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
 			lh_right_ptr[ptn] = &partial_lh_right[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
 		}
+        */
 
 		// scale number must be ZERO
-	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+	    memset(dad_branch->scale_num, 0, scale_size * sizeof(UBYTE));
 		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
 		VectorClass res[VCSIZE];
 
@@ -258,9 +353,17 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 		for (ptn = 0; ptn < nptn; ptn++) {
 	        double *partial_lh = dad_branch->partial_lh + ptn*block;
 
-	        double *lh_left = lh_left_ptr[ptn];
-	        double *lh_right = lh_right_ptr[ptn];
-			for (c = 0; c < ncat; c++) {
+	        double *lh_left;
+	        double *lh_right;
+            if (ptn < orig_nptn) {
+                lh_left = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
+                lh_right = &partial_lh_right[block *  (aln->at(ptn))[right->node->id]];
+            } else {
+                lh_left = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_nptn]];
+                lh_right = &partial_lh_right[block * model_factory->unobserved_ptns[ptn-orig_nptn]];
+            }
+			for (c = 0; c < ncat_mix; c++) {
+                VectorClass *vc_inv_evec_ptr = (VectorClass*)(inv_evec + mix_addr[c]);
 				// compute real partial likelihood vector
 
 				for (x = 0; x < nstates/VCSIZE; x++) {
@@ -269,11 +372,11 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 				// compute dot-product with inv_eigenvector
 				for (i = 0; i < nstates; i+=VCSIZE) {
 					for (j = 0; j < VCSIZE; j++) {
-						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec[(i+j)*nstates/VCSIZE];
+						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec_ptr[(i+j)*nstates/VCSIZE];
 					}
 					for (x = 1; x < nstates/VCSIZE; x++)
 						for (j = 0; j < VCSIZE; j++) {
-							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec[(i+j)*nstates/VCSIZE+x], res[j]);
+							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec_ptr[(i+j)*nstates/VCSIZE+x], res[j]);
 						}
 					horizontal_add(res).store_a(&partial_lh[i]);
 				}
@@ -284,43 +387,19 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 			}
 		}
 
-	    aligned_free(lh_left_ptr);
-	    aligned_free(lh_right_ptr);
-		aligned_free(partial_lh_right);
-		aligned_free(partial_lh_left);
+	    //aligned_free(lh_right_ptr);
+	    //aligned_free(lh_left_ptr);
 	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
 		// special treatment to TIP-INTERNAL NODE case
 		// only take scale_num from the right subtree
-		memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
+		memcpy(dad_branch->scale_num, right->scale_num, scale_size * sizeof(UBYTE));
 
 		// pre compute information for left tip
-		double *partial_lh_left = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+		double *partial_lh_left = partial_lh_leaves;
 
 
-		vector<int>::iterator it;
-		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
-			int state = (*it);
-			VectorClass vc_tip_lh[nstates/VCSIZE];
-			VectorClass vleft[VCSIZE];
-			for (i = 0; i < nstates/VCSIZE; i++)
-				vc_tip_lh[i].load_a(&tip_partial_lh[state*nstates+i*VCSIZE]);
-			for (x = 0; x < block; x+=VCSIZE) {
-				for (j = 0; j < VCSIZE; j++)
-					vleft[j] = eleft[(x+j)*nstates/VCSIZE] * vc_tip_lh[0];
-				for (i = 1; i < nstates/VCSIZE; i++) {
-					for (j = 0; j < VCSIZE; j++)
-						vleft[j] = mul_add(eleft[(x+j)*nstates/VCSIZE+i], vc_tip_lh[i], vleft[j]);
-				}
-				horizontal_add(vleft).store_a(&partial_lh_left[state*block+x]);
-			}
-		}
-
-		size_t addr_unknown = aln->STATE_UNKNOWN * block;
-		for (x = 0; x < block; x++) {
-			partial_lh_left[addr_unknown+x] = 1.0;
-		}
-
 		// assign pointers for partial_lh_left
+        /*
 		double **lh_left_ptr = aligned_alloc<double*>(nptn);
 		for (ptn = 0; ptn < orig_ntn; ptn++) {
 			lh_left_ptr[ptn] = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
@@ -328,8 +407,7 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 		for (ptn = orig_ntn; ptn < nptn; ptn++) {
 			lh_left_ptr[ptn] = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
 		}
-
-		double sum_scale = 0.0;
+        */
 		VectorClass vc_lh_right[nstates/VCSIZE];
 		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
 		VectorClass res[VCSIZE];
@@ -337,15 +415,22 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 		VectorClass vright[VCSIZE];
 
 #ifdef _OPENMP
-#pragma omp parallel for reduction(+: sum_scale) private (ptn, c, x, i, j, vc_lh_right, vc_partial_lh_tmp, res, vc_max, vright)
+#pragma omp parallel for private (ptn, c, x, i, j, vc_lh_right, vc_partial_lh_tmp, res, vc_max, vright)
 #endif
 		for (ptn = 0; ptn < nptn; ptn++) {
 	        double *partial_lh = dad_branch->partial_lh + ptn*block;
 	        double *partial_lh_right = right->partial_lh + ptn*block;
 
-	        double *lh_left = lh_left_ptr[ptn];
-			vc_max = 0.0;
-			for (c = 0; c < ncat; c++) {
+	        double *lh_left;
+            if (ptn < orig_nptn) {
+                lh_left = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
+            } else {
+                lh_left = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_nptn]];
+            }
+
+			for (c = 0; c < ncat_mix; c++) {
+                vc_max = 0.0;
+                VectorClass *vc_inv_evec_ptr = (VectorClass*)(inv_evec + mix_addr[c]);
 				// compute real partial likelihood vector
 				for (i = 0; i < nstates/VCSIZE; i++)
 					vc_lh_right[i].load_a(&partial_lh_right[i*VCSIZE]);
@@ -365,46 +450,37 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 				// compute dot-product with inv_eigenvector
 				for (i = 0; i < nstates; i+=VCSIZE) {
 					for (j = 0; j < VCSIZE; j++) {
-						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec[(i+j)*nstates/VCSIZE];
+						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec_ptr[(i+j)*nstates/VCSIZE];
 					}
 					for (x = 1; x < nstates/VCSIZE; x++) {
 						for (j = 0; j < VCSIZE; j++) {
-							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec[(i+j)*nstates/VCSIZE+x], res[j]);
+							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec_ptr[(i+j)*nstates/VCSIZE+x], res[j]);
 						}
 					}
 					VectorClass sum_res = horizontal_add(res);
 					sum_res.store_a(&partial_lh[i]);
 					vc_max = max(vc_max, abs(sum_res)); // take the maximum for scaling check
 				}
+                // check if this category's partial likelihoods should be scaled
+                double lh_max = horizontal_max(vc_max);
+                if (lh_max < SCALING_THRESHOLD && ptn_invar[ptn] == 0.0 && lh_max != 0.0) {
+                    // partial_lh already points at category c: rescale only its nstates entries
+                    VectorClass scale_thres(SCALING_THRESHOLD_INVER);
+                    for (i = 0; i < nstates; i+=VCSIZE) {
+                        (VectorClass().load_a(&partial_lh[i]) * scale_thres).store_a(&partial_lh[i]);
+                    }
+                    dad_branch->scale_num[ptn*ncat_mix+c] += 1;
+                }
 				lh_left += nstates;
 				partial_lh_right += nstates;
 				partial_lh += nstates;
 			}
-            // check if one should scale partial likelihoods
-			double lh_max = horizontal_max(vc_max);
-            if (lh_max < SCALING_THRESHOLD && ptn_invar[ptn] == 0.0) {
-            	// now do the likelihood scaling
-            	partial_lh -= block; // revert its pointer
-            	VectorClass scale_thres(SCALING_THRESHOLD_INVER);
-				for (i = 0; i < block; i+=VCSIZE) {
-					(VectorClass().load_a(&partial_lh[i]) * scale_thres).store_a(&partial_lh[i]);
-				}
-				// unobserved const pattern will never have underflow
-				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
-				dad_branch->scale_num[ptn] += 1;
-				partial_lh += block; // increase the pointer again
-            }
 
 		}
-		dad_branch->lh_scale_factor += sum_scale;
-
-	    aligned_free(lh_left_ptr);
-		aligned_free(partial_lh_left);
 
 	} else {
 		// both left and right are internal node
 
-		double sum_scale = 0.0;
 		VectorClass vc_max; // maximum of partial likelihood, for scaling check
 		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
 		VectorClass vc_lh_left[nstates/VCSIZE], vc_lh_right[nstates/VCSIZE];
@@ -412,16 +488,20 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 		VectorClass vleft[VCSIZE], vright[VCSIZE];
 
 #ifdef _OPENMP
-#pragma omp parallel for reduction (+: sum_scale) private(ptn, c, x, i, j, vc_max, vc_partial_lh_tmp, vc_lh_left, vc_lh_right, res, vleft, vright)
+#pragma omp parallel for private(ptn, c, x, i, j, vc_max, vc_partial_lh_tmp, vc_lh_left, vc_lh_right, res, vleft, vright)
 #endif
 		for (ptn = 0; ptn < nptn; ptn++) {
 	        double *partial_lh = dad_branch->partial_lh + ptn*block;
 			double *partial_lh_left = left->partial_lh + ptn*block;
 			double *partial_lh_right = right->partial_lh + ptn*block;
-
-			dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
-			vc_max = 0.0;
-			for (c = 0; c < ncat; c++) {
+            UBYTE *scale_dad = dad_branch->scale_num + ptn*ncat_mix;
+            UBYTE *scale_left = left->scale_num + ptn*ncat_mix;
+            UBYTE *scale_right = right->scale_num + ptn*ncat_mix; 
+
+			for (c = 0; c < ncat_mix; c++) {
+                scale_dad[c] = scale_left[c] + scale_right[c];
+                vc_max = 0.0;
+                VectorClass *vc_inv_evec_ptr = (VectorClass*)(inv_evec + mix_addr[c]);
 				// compute real partial likelihood vector
 				for (i = 0; i < nstates/VCSIZE; i++) {
 					vc_lh_left[i].load_a(&partial_lh_left[i*VCSIZE]);
@@ -447,43 +527,40 @@ void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, Phy
 				// compute dot-product with inv_eigenvector
 				for (i = 0; i < nstates; i+=VCSIZE) {
 					for (j = 0; j < VCSIZE; j++) {
-						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec[(i+j)*nstates/VCSIZE];
+						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec_ptr[(i+j)*nstates/VCSIZE];
 					}
 					for (x = 1; x < nstates/VCSIZE; x++)
 						for (j = 0; j < VCSIZE; j++)
-							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec[(i+j)*nstates/VCSIZE+x], res[j]);
+							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec_ptr[(i+j)*nstates/VCSIZE+x], res[j]);
 
 					VectorClass sum_res = horizontal_add(res);
 					sum_res.store_a(&partial_lh[i]);
 					vc_max = max(vc_max, abs(sum_res)); // take the maximum for scaling check
 				}
+                // check if this category's partial likelihoods should be scaled
+                double lh_max = horizontal_max(vc_max);
+                if (lh_max < SCALING_THRESHOLD && ptn_invar[ptn] == 0.0 && lh_max != 0.0) {
+                    // partial_lh already points at category c: rescale only its nstates entries
+                    VectorClass scale_thres(SCALING_THRESHOLD_INVER);
+                    for (i = 0; i < nstates; i+=VCSIZE) {
+                        (VectorClass().load_a(&partial_lh[i]) * scale_thres).store_a(&partial_lh[i]);
+                    }
+                    // unobserved const pattern will never have underflow
+                    scale_dad[c] += 1;
+                }
 				partial_lh += nstates;
 				partial_lh_left += nstates;
 				partial_lh_right += nstates;
 			}
 
-            // check if one should scale partial likelihoods
-			double lh_max = horizontal_max(vc_max);
-            if (lh_max < SCALING_THRESHOLD && ptn_invar[ptn] == 0.0) {
-				// now do the likelihood scaling
-            	partial_lh -= block; // revert its pointer
-            	VectorClass scale_thres(SCALING_THRESHOLD_INVER);
-				for (i = 0; i < block; i+=VCSIZE) {
-					(VectorClass().load_a(&partial_lh[i]) * scale_thres).store_a(&partial_lh[i]);
-				}
-				// unobserved const pattern will never have underflow
-				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
-				dad_branch->scale_num[ptn] += 1;
-				partial_lh += block; // increase the pointer again
-            }
 
 		}
-		dad_branch->lh_scale_factor += sum_scale;
 
 	}
 
-	aligned_free(eright);
-	aligned_free(eleft);
+	if (partial_lh_leaves)
+        aligned_free(partial_lh_leaves);
+	aligned_free(echildren);
 }
 
 template <class VectorClass, const int VCSIZE, const int nstates>
@@ -506,14 +583,19 @@ void PhyloTree::computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloN
         computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
     df = ddf = 0.0;
     size_t ncat = site_rate->getNRate();
-
-    size_t block = ncat * nstates;
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
+    size_t block = ncat_mix * nstates;
+    size_t tip_block = nstates * model->getNMixtures();
     size_t ptn; // for big data size > 4GB memory required
     size_t c, i, j;
     size_t orig_nptn = aln->size();
     size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
     size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
     maxptn = max(maxptn, aln->size()+((model_factory->unobserved_ptns.size()+VCSIZE-1)/VCSIZE)*VCSIZE);
+
+    size_t mix_addr_nstates[ncat_mix];
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+
     double *eval = model->getEigenvalues();
     assert(eval);
 
@@ -522,11 +604,15 @@ void PhyloTree::computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloN
 	VectorClass *vc_val2 = (VectorClass*)aligned_alloc<double>(block);
 
 	VectorClass vc_len = dad_branch->length;
-	for (c = 0; c < ncat; c++) {
-		VectorClass vc_rate = site_rate->getRate(c);
-		VectorClass vc_prop = site_rate->getProp(c);
+	for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom;
+        mix_addr_nstates[c] = m*nstates;
+        size_t mycat = c%ncat;
+        double *eval_ptr = eval + m*nstates;
+		VectorClass vc_rate = site_rate->getRate(mycat);
+		VectorClass vc_prop = site_rate->getProp(mycat) * model->getMixtureWeight(m);
 		for (i = 0; i < nstates/VCSIZE; i++) {
-			VectorClass cof = VectorClass().load_a(&eval[i*VCSIZE]) * vc_rate;
+			VectorClass cof = VectorClass().load_a(&eval_ptr[i*VCSIZE]) * vc_rate;
 			VectorClass val = exp(cof*vc_len) * vc_prop;
 			VectorClass val1_ = cof*val;
 			vc_val0[c*nstates/VCSIZE+i] = val;
@@ -538,41 +624,79 @@ void PhyloTree::computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloN
 	assert(theta_all);
 	if (!theta_computed) {
 		theta_computed = true;
+        double scale_all = 0.0;
 		// precompute theta for fast branch length optimization
 
 		if (dad->isLeaf()) {
 	    	// special treatment for TIP-INTERNAL NODE case
 #ifdef _OPENMP
-#pragma omp parallel for private(ptn, i)
+#pragma omp parallel for private(ptn, i, c) reduction(+: scale_all)
 #endif
-			for (ptn = 0; ptn < orig_nptn; ptn++) {
+			for (ptn = 0; ptn < nptn; ptn++) {
 			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+                UBYTE *scale_dad = dad_branch->scale_num+ptn*ncat_mix;
 				double *theta = theta_all + ptn*block;
-				double *lh_dad = &tip_partial_lh[(aln->at(ptn))[dad->id] * nstates];
-				for (i = 0; i < block; i+=VCSIZE) {
-					(VectorClass().load_a(&lh_dad[i%nstates]) * VectorClass().load_a(&partial_lh_dad[i])).store_a(&theta[i]);
-				}
-			}
-			// ascertainment bias correction
-			for (ptn = orig_nptn; ptn < nptn; ptn++) {
-			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
-				double *theta = theta_all + ptn*block;
-				double *lh_dad = &tip_partial_lh[model_factory->unobserved_ptns[ptn-orig_nptn] * nstates];
-				for (i = 0; i < block; i+=VCSIZE) {
-					(VectorClass().load_a(&lh_dad[i%nstates]) * VectorClass().load_a(&partial_lh_dad[i])).store_a(&theta[i]);
-				}
+                double *this_tip_partial_lh = tip_partial_lh + tip_block*((ptn < orig_nptn) ? (aln->at(ptn))[dad->id] :  model_factory->unobserved_ptns[ptn-orig_nptn]);
+                UBYTE min_scale = scale_dad[0];
+                for (c = 1; c < ncat_mix; c++)
+                    min_scale = min(min_scale, scale_dad[c]);
+
+                scale_all += (double)min_scale;
+
+                for (c = 0; c < ncat_mix; c++) {
+                    double *lh_dad = this_tip_partial_lh + mix_addr_nstates[c];
+                    if (scale_dad[c] == min_scale) {
+                        for (i = 0; i < nstates; i+=VCSIZE) {
+                            (VectorClass().load_a(&lh_dad[i]) * VectorClass().load_a(&partial_lh_dad[i])).store_a(&theta[i]);
+                        }
+                    } else if (scale_dad[c] == min_scale+1) {
+                        for (i = 0; i < nstates; i+=VCSIZE) {
+                            (VectorClass().load_a(&lh_dad[i]) * VectorClass().load_a(&partial_lh_dad[i]) * VectorClass(SCALING_THRESHOLD)).store_a(&theta[i]);
+                        }
+                    } else {
+                        memset(theta, 0, sizeof(double)*nstates);
+                    }
+                    partial_lh_dad += nstates;
+                    theta += nstates;
+                }
 			}
 	    } else {
 	    	// both dad and node are internal nodes
-		    double *partial_lh_node = node_branch->partial_lh;
-		    double *partial_lh_dad = dad_branch->partial_lh;
-	    	size_t all_entries = nptn*block;
 #ifdef _OPENMP
-#pragma omp parallel for private(i)
+#pragma omp parallel for private(i, c) reduction(+: scale_all)
 #endif
-	    	for (i = 0; i < all_entries; i+=VCSIZE) {
-				(VectorClass().load_a(&partial_lh_node[i]) * VectorClass().load_a(&partial_lh_dad[i]))
-						.store_a(&theta_all[i]);
+	    	for (ptn = 0; ptn < nptn; ptn++) {
+				double *theta = theta_all + ptn*block;
+			    double *partial_lh_node = node_branch->partial_lh + ptn*block;
+			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+
+                size_t ptn_ncat = ptn*ncat_mix; 
+                UBYTE *scale_dad = dad_branch->scale_num + ptn_ncat;
+                UBYTE *scale_node = node_branch->scale_num + ptn_ncat;
+                UBYTE sum_scale[ncat_mix];
+                UBYTE min_scale = sum_scale[0] = scale_dad[0] + scale_node[0];
+                for (c = 1; c < ncat_mix; c++) {
+                    sum_scale[c] = scale_dad[c] + scale_node[c];
+                    min_scale = min(min_scale, sum_scale[c]);
+                }
+                scale_all += (double)min_scale;
+
+                for (c = 0; c < ncat_mix; c++) {
+                    if (sum_scale[c] == min_scale) {
+                        for (i = 0; i < nstates; i++) {
+                            (VectorClass().load_a(&partial_lh_node[i]) * VectorClass().load_a(&partial_lh_dad[i])).store_a(&theta[i]);
+                        }
+                    } else if (sum_scale[c] == min_scale+1) {
+                        for (i = 0; i < nstates; i++) {
+                            (VectorClass().load_a(&partial_lh_node[i]) * VectorClass().load_a(&partial_lh_dad[i]) * VectorClass(SCALING_THRESHOLD)).store_a(&theta[i]);
+                        }
+                    } else {
+                        memset(theta, 0, sizeof(double)*nstates);
+                    }
+                    theta += nstates;
+                    partial_lh_dad += nstates;
+                    partial_lh_node += nstates;
+                }
 			}
 	    }
 		if (nptn < maxptn) {
@@ -580,6 +704,7 @@ void PhyloTree::computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloN
 			for (ptn = nptn; ptn < maxptn; ptn++)
 				memcpy(&theta_all[ptn*block], theta_all, block*sizeof(double));
 		}
+        buffer_scale_all = scale_all*LOG_SCALING_THRESHOLD;
 	}
 
 
@@ -648,12 +773,8 @@ void PhyloTree::computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloN
 #endif
 	df = horizontal_add(df_final);
 	ddf = horizontal_add(ddf_final);
-    if (isnan(df) || isinf(df)) {
-        df = 0.0;
-        ddf = 0.0;
-//        outWarning("Numerical instability (some site-likelihood = 0)");
-    }
-
+    
+    assert(!isnan(df) && !isinf(df) && "Numerical underflow for SIMD lh-derivative");
 
 //	assert(isnormal(tree_lh));
 	if (orig_nptn < nptn) {
@@ -754,8 +875,12 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
         computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
     double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
     size_t ncat = site_rate->getNRate();
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+    size_t mix_addr_nstates[ncat_mix];
 
-    size_t block = ncat * nstates;
+    size_t block = ncat_mix * nstates;
+    size_t tip_block = nstates * model->getNMixtures();
     size_t ptn; // for big data size > 4GB memory required
     size_t c, i, j;
     size_t orig_nptn = aln->size();
@@ -768,13 +893,16 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
     VectorClass *vc_val = (VectorClass*)aligned_alloc<double>(block);
 
 
-	for (c = 0; c < ncat; c++) {
-		double len = site_rate->getRate(c)*dad_branch->length;
-		VectorClass vc_len(len);
-		VectorClass vc_prop(site_rate->getProp(c));
+	for (c = 0; c < ncat_mix; c++) {
+        size_t mycat = c%ncat;
+        size_t m = c/denom;
+        mix_addr_nstates[c] = m*nstates;
+        double *eval_ptr = eval + mix_addr_nstates[c];
+		VectorClass vc_len(site_rate->getRate(mycat)*dad_branch->length);
+		VectorClass vc_prop(site_rate->getProp(c) * model->getMixtureWeight(m));
 		for (i = 0; i < nstates/VCSIZE; i++) {
 			// eval is not aligned!
-			vc_val[c*nstates/VCSIZE+i] = exp(VectorClass().load_a(&eval[i*VCSIZE]) * vc_len) * vc_prop;
+			vc_val[c*nstates/VCSIZE+i] = exp(VectorClass().load_a(&eval_ptr[i*VCSIZE]) * vc_len) * vc_prop;
 		}
 	}
 
@@ -782,55 +910,84 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
 
 	if (dad->isLeaf()) {
     	// special treatment for TIP-INTERNAL NODE case
-    	VectorClass vc_tip_partial_lh[nstates];
-    	VectorClass vc_partial_lh_dad[VCSIZE], vc_ptn[VCSIZE];
+
+    	// precompute information from one tip        
+    	double *partial_lh_node = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+    	IntVector states_dad = aln->seq_states[dad->id];
+    	states_dad.push_back(aln->STATE_UNKNOWN);
+    	for (IntVector::iterator it = states_dad.begin(); it != states_dad.end(); it++) {
+    		double *lh_node = partial_lh_node + (*it)*block;
+    		double *lh_tip = tip_partial_lh + (*it)*tip_block;
+    		VectorClass *vc_val_tmp = vc_val;
+            for (c = 0; c < ncat_mix; c++) {
+                double *this_lh_tip = lh_tip + mix_addr_nstates[c];
+                for (i = 0; i < nstates; i+=VCSIZE) {
+                    (vc_val_tmp[i/VCSIZE] * VectorClass().load_a(&this_lh_tip[i])).store_a(&lh_node[i]);
+                }
+                lh_node += nstates;
+                vc_val_tmp += nstates/VCSIZE;
+            }
+    	}
+
+
+    	//VectorClass vc_tip_partial_lh[nstates];
+    	//VectorClass vc_partial_lh_dad[VCSIZE]
+        VectorClass vc_ptn[VCSIZE];
     	VectorClass lh_final(0.0), vc_freq;
 		VectorClass lh_ptn; // store likelihoods of VCSIZE consecutive patterns
 
-    	double **lh_states_dad = aligned_alloc<double*>(maxptn);
-    	for (ptn = 0; ptn < orig_nptn; ptn++)
-    		lh_states_dad[ptn] = &tip_partial_lh[(aln->at(ptn))[dad->id] * nstates];
-    	for (ptn = orig_nptn; ptn < nptn; ptn++)
-    		lh_states_dad[ptn] = &tip_partial_lh[model_factory->unobserved_ptns[ptn-orig_nptn] * nstates];
-    	// initialize beyond #patterns for efficiency
-    	for (ptn = nptn; ptn < maxptn; ptn++)
-    		lh_states_dad[ptn] = &tip_partial_lh[aln->STATE_UNKNOWN * nstates];
+		int *ptn_states_dad = aligned_alloc<int>(maxptn);
+		for (ptn = 0; ptn < orig_nptn; ptn++)
+			ptn_states_dad[ptn] = (aln->at(ptn))[dad->id];
+		for (ptn = orig_nptn; ptn < nptn; ptn++)
+			ptn_states_dad[ptn] = model_factory->unobserved_ptns[ptn-orig_nptn];
+		// initialize beyond #patterns for efficiency
+		for (ptn = nptn; ptn < maxptn; ptn++)
+			ptn_states_dad[ptn] = aln->STATE_UNKNOWN;
 
 		// copy dummy values because VectorClass will access beyond nptn
 		for (ptn = nptn; ptn < maxptn; ptn++)
 			memcpy(&dad_branch->partial_lh[ptn*block], dad_branch->partial_lh, block*sizeof(double));
 
 #ifdef _OPENMP
-#pragma omp parallel private(ptn, i, j, vc_tip_partial_lh, vc_partial_lh_dad, vc_ptn, vc_freq, lh_ptn)
+#pragma omp parallel private(ptn, i, j, vc_ptn, vc_freq, lh_ptn)
     {
     	VectorClass lh_final_th = 0.0;
 #pragma omp for nowait
 #endif
    		// main loop over all patterns with a step size of VCSIZE
 		for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
-			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
-
-			// initialize vc_tip_partial_lh
+			//double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+            VectorClass vc_scale;
 			for (j = 0; j < VCSIZE; j++) {
-				double *lh_dad = lh_states_dad[ptn+j];
-				for (i = 0; i < nstates/VCSIZE; i++) {
-					vc_tip_partial_lh[j*(nstates/VCSIZE)+i].load_a(&lh_dad[i*VCSIZE]);
-				}
-				vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block]);
-				vc_ptn[j] = vc_val[0] * vc_tip_partial_lh[j*(nstates/VCSIZE)] * vc_partial_lh_dad[j];
+                vc_ptn[j] = 0.0;
+				double *partial_lh_dad = dad_branch->partial_lh + (ptn+j)*block;
+                UBYTE *scale_dad = dad_branch->scale_num + (ptn+j)*ncat_mix;
+                // determine the min scaling
+                UBYTE min_scale = *min_element(scale_dad, scale_dad+ncat_mix);
+                vc_scale.insert(j, (double)min_scale);
+
+				double *lh_node = &partial_lh_node[ptn_states_dad[ptn+j]*block];
+
+                for (c = 0; c < ncat_mix; c++) {
+                    VectorClass this_vc_ptn = 0.0;
+                    if (scale_dad[c] <= min_scale+1) {
+                        for (i = 0; i < nstates; i+=VCSIZE) {
+                            this_vc_ptn = mul_add(VectorClass().load_a(&lh_node[i]), VectorClass().load_a(&partial_lh_dad[i]), this_vc_ptn);
+                        }
+                        if (scale_dad[c] == min_scale)
+                            vc_ptn[j] += this_vc_ptn;
+                        else
+                            vc_ptn[j] += this_vc_ptn * VectorClass(SCALING_THRESHOLD);
+                    }
+                    lh_node += nstates;
+                    partial_lh_dad += nstates;
+                }
 			}
 
-			// compute vc_ptn
-			for (i = 1; i < block/VCSIZE; i++)
-				for (j = 0; j < VCSIZE; j++) {
-					vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block+i*VCSIZE]);
-					vc_ptn[j] = mul_add(vc_val[i] * vc_tip_partial_lh[j*(nstates/VCSIZE)+i%(nstates/VCSIZE)],
-							vc_partial_lh_dad[j], vc_ptn[j]);
-				}
-
 			vc_freq.load_a(&ptn_freq[ptn]);
 			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
-			lh_ptn = log(abs(lh_ptn));
+			lh_ptn = log(abs(lh_ptn)) + vc_scale*LOG_SCALING_THRESHOLD;
 			lh_ptn.store_a(&_pattern_lh[ptn]);
 
 			// multiply with pattern frequency
@@ -849,57 +1006,46 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
     }
 #endif
 		tree_lh += horizontal_add(lh_final);
-        if (isnan(tree_lh) || isinf(tree_lh)) {
-            cout << "WARNING: Numerical underflow caused by alignment sites";
-            i = aln->getNSite();
-            for (j = 0; j < i; j++) {
-                ptn = aln->getPatternID(j);
-                if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
-                	cout << " " << j+1;
-                }
-            }
-            tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
-            for (ptn = 0; ptn < orig_nptn; ptn++) {
-                if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
-                	_pattern_lh[ptn] = LOG_SCALING_THRESHOLD*4; // log(2^(-1024))
-                }
-            	tree_lh += _pattern_lh[ptn] * ptn_freq[ptn];
-            }
-            cout << endl;
-//            cout << "WARNING: Tree log-likelihood is set to " << tree_lh << endl;
-        }
 
-		// ascertainment bias correction
+        assert(!isnan(tree_lh) & !isinf(tree_lh) && "Numerical underflow for SIMD lh-branch");
+
+        // ascertainment bias correction
 		if (orig_nptn < nptn) {
 			lh_final = 0.0;
 			lh_ptn = 0.0;
 			for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
-				double *partial_lh_dad = &dad_branch->partial_lh[ptn*block];
+//				double *partial_lh_dad = &dad_branch->partial_lh[ptn*block];
+                VectorClass vc_scale;
 				lh_final += lh_ptn;
-
-				// initialize vc_tip_partial_lh
 				for (j = 0; j < VCSIZE; j++) {
-					double *lh_dad = lh_states_dad[ptn+j];
-					for (i = 0; i < nstates/VCSIZE; i++) {
-						vc_tip_partial_lh[j*(nstates/VCSIZE)+i].load(&lh_dad[i*VCSIZE]); // lh_dad is not aligned!
-					}
-					vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block]);
-					vc_ptn[j] = vc_val[0] * vc_tip_partial_lh[j*(nstates/VCSIZE)] * vc_partial_lh_dad[j];
+					vc_ptn[j] = 0.0;
+					double *partial_lh_dad = dad_branch->partial_lh + (ptn+j)*block;
+                    UBYTE *scale_dad = dad_branch->scale_num + (ptn+j)*ncat_mix;
+                    // determine the min scaling
+                    UBYTE min_scale = *min_element(scale_dad, scale_dad+ncat_mix);
+                    vc_scale.insert(j, min_scale);
+
+					int state_dad = ptn_states_dad[ptn+j];
+					double *lh_node = &partial_lh_node[state_dad*block];
+
+                    for (c = 0; c < ncat_mix; c++) {
+                        VectorClass this_vc_ptn = 0.0;
+                        if (scale_dad[c] <= min_scale+1) {
+                            for (i = 0; i < nstates; i+=VCSIZE) {
+                                this_vc_ptn = mul_add(VectorClass().load_a(&lh_node[i]), VectorClass().load_a(&partial_lh_dad[i]), this_vc_ptn);
+                            }
+                            if (scale_dad[c] == min_scale)
+                                vc_ptn[j] += this_vc_ptn;
+                            else
+                                vc_ptn[j] += this_vc_ptn * VectorClass(SCALING_THRESHOLD);
+                        }
+                        lh_node += nstates;
+                        partial_lh_dad += nstates;
+                    }
+                    // bugfix 2016-01-21, prob_const can be rescaled
+                    if (min_scale >= 1)
+                        vc_ptn[j] = vc_ptn[j] * VectorClass(SCALING_THRESHOLD);
 				}
-
-				// compute vc_ptn
-				for (i = 1; i < block/VCSIZE; i++)
-					for (j = 0; j < VCSIZE; j++) {
-						vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block+i*VCSIZE]);
-						vc_ptn[j] = mul_add(vc_val[i] * vc_tip_partial_lh[j*(nstates/VCSIZE)+i%(nstates/VCSIZE)],
-								vc_partial_lh_dad[j], vc_ptn[j]);
-					}
-                    
-                // bugfix 2016-01-21, prob_const can be rescaled
-                for (j = 0; j < VCSIZE; j++)
-                    if (dad_branch->scale_num[ptn+j] >= 1)
-                        vc_ptn[j] = vc_ptn[j] * SCALING_THRESHOLD;
-
 				// ptn_invar[ptn] is not aligned
 				lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
 			}
@@ -911,7 +1057,9 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
 			default: assert(0); break;
 			}
 		}
-		aligned_free(lh_states_dad);
+		aligned_free(ptn_states_dad);
+		aligned_free(partial_lh_node);
+
     } else {
     	// both dad and node are internal nodes
     	VectorClass vc_partial_lh_node[VCSIZE];
@@ -932,25 +1080,46 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
 #pragma omp for nowait
 #endif
 		for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
-			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
-			double *partial_lh_node = node_branch->partial_lh + ptn*block;
 
-			for (j = 0; j < VCSIZE; j++)
-				vc_ptn[j] = 0.0;
+            VectorClass vc_scale;
+			for (j = 0; j < VCSIZE; j++) {
+                vc_ptn[j] = 0.0;
+				double *partial_lh_dad = dad_branch->partial_lh + (ptn+j)*block;
+                double *partial_lh_node = node_branch->partial_lh + (ptn+j)*block;
+                VectorClass *val_tmp = vc_val;
+                UBYTE *scale_dad = dad_branch->scale_num + (ptn+j)*ncat_mix;
+                UBYTE *scale_node = node_branch->scale_num + (ptn+j)*ncat_mix;
+                // determine the min scaling
+                UBYTE sum_scale[ncat_mix];
+                UBYTE min_scale = sum_scale[0] = scale_dad[0]+scale_node[0];
+                for (c = 1; c < ncat_mix; c++) {
+                    sum_scale[c] = scale_dad[c] + scale_node[c];
+                    min_scale = min(min_scale, sum_scale[c]);
+                }
+                vc_scale.insert(j, min_scale);
 
-			for (i = 0; i < block; i+=VCSIZE) {
-				for (j = 0; j < VCSIZE; j++) {
-					vc_partial_lh_node[j].load_a(&partial_lh_node[i+j*block]);
-					vc_partial_lh_dad[j].load_a(&partial_lh_dad[i+j*block]);
-					vc_ptn[j] = mul_add(vc_val[i/VCSIZE] * vc_partial_lh_node[j], vc_partial_lh_dad[j], vc_ptn[j]);
-				}
+                for (c = 0; c < ncat_mix; c++) {
+                    if (sum_scale[c] <= min_scale+1) {
+                        VectorClass this_vc_ptn = 0.0;
+                        for (i = 0; i < nstates; i+=VCSIZE) {
+                            this_vc_ptn = mul_add(VectorClass().load_a(&partial_lh_node[i]) * VectorClass().load_a(&partial_lh_dad[i]), val_tmp[i/VCSIZE], this_vc_ptn);
+                        }
+                        if (sum_scale[c] == min_scale)
+                            vc_ptn[j] += this_vc_ptn;
+                        else
+                            vc_ptn[j] += this_vc_ptn * VectorClass(SCALING_THRESHOLD);
+                    }
+                    partial_lh_node += nstates;
+                    partial_lh_dad += nstates;
+                    val_tmp += nstates/VCSIZE;
+                }
 			}
 
 			vc_freq.load_a(&ptn_freq[ptn]);
 
 			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
 
-			lh_ptn = log(abs(lh_ptn));
+			lh_ptn = log(abs(lh_ptn)) + vc_scale*LOG_SCALING_THRESHOLD;
 			lh_ptn.store_a(&_pattern_lh[ptn]);
 #ifdef _OPENMP
 			lh_final_th = mul_add(lh_ptn, vc_freq, lh_final_th);
@@ -973,12 +1142,47 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
 			// ascertainment bias correction
 			lh_final = 0.0;
 			lh_ptn = 0.0;
-			double *partial_lh_node = &node_branch->partial_lh[orig_nptn*block];
-			double *partial_lh_dad = &dad_branch->partial_lh[orig_nptn*block];
 
 			for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
 				lh_final += lh_ptn;
 
+                VectorClass vc_scale;
+                for (j = 0; j < VCSIZE; j++) {
+                    vc_ptn[j] = 0.0;
+                    double *partial_lh_dad = dad_branch->partial_lh + (ptn+j)*block;
+                    double *partial_lh_node = node_branch->partial_lh + (ptn+j)*block;
+                    VectorClass *val_tmp = vc_val;
+                    UBYTE *scale_dad = dad_branch->scale_num + (ptn+j)*ncat_mix;
+                    UBYTE *scale_node = node_branch->scale_num + (ptn+j)*ncat_mix;
+                    // determine the min scaling
+                    UBYTE sum_scale[ncat_mix];
+                    UBYTE min_scale = sum_scale[0] = scale_dad[0]+scale_node[0];
+                    for (c = 1; c < ncat_mix; c++) {
+                        sum_scale[c] = scale_dad[c] + scale_node[c];
+                        min_scale = min(min_scale, sum_scale[c]);
+                    }
+                    vc_scale.insert(j, min_scale);
+
+                    for (c = 0; c < ncat_mix; c++) {
+                        if (sum_scale[c] <= min_scale+1) {
+                            VectorClass this_vc_ptn = 0.0;
+                            for (i = 0; i < nstates; i+=VCSIZE) {
+                                this_vc_ptn = mul_add(VectorClass().load_a(&partial_lh_node[i]) * VectorClass().load_a(&partial_lh_dad[i]), val_tmp[i/VCSIZE], this_vc_ptn);
+                            }
+                            if (sum_scale[c] == min_scale)
+                                vc_ptn[j] += this_vc_ptn;
+                            else
+                                vc_ptn[j] += this_vc_ptn * VectorClass(SCALING_THRESHOLD);
+                        }
+                        partial_lh_node += nstates;
+                        partial_lh_dad += nstates;
+                        val_tmp += nstates/VCSIZE;
+                    }
+                    if (min_scale >= 1)
+                        vc_ptn[j] *= VectorClass(SCALING_THRESHOLD);
+                }
+
+                /*
 				for (j = 0; j < VCSIZE; j++)
 					vc_ptn[j] = 0.0;
 
@@ -990,15 +1194,14 @@ double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, Ph
 					}
 				}
 
+
                 // bugfix 2016-01-21, prob_const can be rescaled
                 for (j = 0; j < VCSIZE; j++)
                     if (dad_branch->scale_num[ptn+j] + node_branch->scale_num[ptn+j] >= 1)
                         vc_ptn[j] = vc_ptn[j] * SCALING_THRESHOLD;
-
+                */
 				// ptn_invar[ptn] is not aligned
 				lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
-				partial_lh_node += block*VCSIZE;
-				partial_lh_dad += block*VCSIZE;
 			}
 			switch ((nptn-orig_nptn)%VCSIZE) {
 			case 0: prob_const = horizontal_add(lh_final+lh_ptn); break;
@@ -1032,7 +1235,10 @@ double PhyloTree::computeLikelihoodFromBufferEigenSIMD() {
 	double tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
 
     size_t ncat = site_rate->getNRate();
-    size_t block = ncat * nstates;
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+
+    size_t block = ncat_mix * nstates;
     size_t ptn; // for big data size > 4GB memory required
     size_t c, i, j;
     size_t orig_nptn = aln->size();
@@ -1044,11 +1250,14 @@ double PhyloTree::computeLikelihoodFromBufferEigenSIMD() {
 	VectorClass *vc_val0 = (VectorClass*)aligned_alloc<double>(block);
 
 	VectorClass vc_len = current_it->length;
-	for (c = 0; c < ncat; c++) {
-		VectorClass vc_rate = site_rate->getRate(c);
-		VectorClass vc_prop = site_rate->getProp(c);
+	for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom;
+        double *eval_ptr = eval + (m)*nstates;
+        size_t mycat = c%ncat;
+		VectorClass vc_rate = site_rate->getRate(mycat);
+		VectorClass vc_prop = site_rate->getProp(mycat) * model->getMixtureWeight(m);
 		for (i = 0; i < nstates/VCSIZE; i++) {
-			VectorClass cof = VectorClass().load_a(&eval[i*VCSIZE]) * vc_rate;
+			VectorClass cof = VectorClass().load_a(&eval_ptr[i*VCSIZE]) * vc_rate;
 			VectorClass val = exp(cof*vc_len) * vc_prop;
 			vc_val0[c*nstates/VCSIZE+i] = val;
 		}
@@ -1100,30 +1309,9 @@ double PhyloTree::computeLikelihoodFromBufferEigenSIMD() {
 	}
 }
 #endif
-	tree_lh += horizontal_add(lh_final);
-    if (isnan(tree_lh) || isinf(tree_lh)) {
-        cout << "WARNING: Numerical underflow caused by alignment sites";
-        i = aln->getNSite();
-        for (j = 0, c = 0; j < i; j++) {
-            ptn = aln->getPatternID(j);
-            if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
-                cout << " " << j+1;
-                c++;
-                if (c >= 10) {
-                    cout << " ...";
-                    break;
-                }
-            }
-        }
-        cout << endl;
-        tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
-        for (ptn = 0; ptn < orig_nptn; ptn++) {
-            if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
-                _pattern_lh[ptn] = LOG_SCALING_THRESHOLD*4; // log(2^(-1024))
-            }
-            tree_lh += _pattern_lh[ptn] * ptn_freq[ptn];
-        }
-    }
+	tree_lh += horizontal_add(lh_final) + buffer_scale_all;
+
+    assert(!isnan(tree_lh) && !isinf(tree_lh) && "Numerical underflow for SIMD lh-FromBuffer");
 
 	if (orig_nptn < nptn) {
 		// ascertaiment bias correction
@@ -1132,41 +1320,43 @@ double PhyloTree::computeLikelihoodFromBufferEigenSIMD() {
 		double prob_const;// df_const, ddf_const;
 		double *theta = &theta_all[orig_nptn*block];
 
-        UBYTE sum_scale_num[nstates+VCSIZE];
+        UBYTE sum_scale_num[(nstates+VCSIZE)*ncat_mix];
         memset(sum_scale_num, 0, sizeof(UBYTE)*(nstates+VCSIZE));
         if (current_it->node->isLeaf())
-            memcpy(sum_scale_num, current_it_back->scale_num+orig_nptn, sizeof(UBYTE)*(nptn-orig_nptn));
+            memcpy(sum_scale_num, current_it_back->scale_num+orig_nptn*ncat_mix, sizeof(UBYTE)*(nptn-orig_nptn)*ncat_mix);
         else if (current_it_back->node->isLeaf())
-            memcpy(sum_scale_num, current_it->scale_num+orig_nptn, sizeof(UBYTE)*(nptn-orig_nptn));
+            memcpy(sum_scale_num, current_it->scale_num+orig_nptn*ncat_mix, sizeof(UBYTE)*(nptn-orig_nptn)*ncat_mix);
         else {
-            for (ptn = orig_nptn; ptn < nptn; ptn++)
-                sum_scale_num[ptn-orig_nptn] = current_it->scale_num[ptn] + current_it_back->scale_num[ptn];
+            UBYTE *cur_scale_num = current_it->scale_num + orig_nptn*ncat_mix;
+            UBYTE *back_scale_num = current_it_back->scale_num + orig_nptn*ncat_mix;
+            c = (nptn-orig_nptn)*ncat_mix;
+            for (i = 0; i < c; i++)
+                sum_scale_num[i] = cur_scale_num[i] + back_scale_num[i];
         }
 
-        for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
-			lh_final += lh_ptn;
-
+        for (ptn = orig_nptn; ptn < nptn; ptn++) {
+			//lh_final += lh_ptn;
 			// initialization
-			for (i = 0; i < VCSIZE; i++) {
-				vc_ptn[i] = vc_val0[0] * VectorClass().load_a(theta+i*block);
-			}
+            VectorClass this_vc_ptn = vc_val0[0] * VectorClass().load_a(theta);
+
+            UBYTE *this_sum_scale = sum_scale_num + (ptn-orig_nptn)*ncat_mix;
+            UBYTE min_scale = *min_element(this_sum_scale, this_sum_scale + ncat_mix);
 
 			for (i = 1; i < block/VCSIZE; i++) {
-				for (j = 0; j < VCSIZE; j++) {
-					vc_ptn[j] = mul_add(VectorClass().load_a(&theta[i*VCSIZE+j*block]), vc_val0[i], vc_ptn[j]);
-				}
+                this_vc_ptn = mul_add(VectorClass().load_a(&theta[i*VCSIZE]), vc_val0[i], this_vc_ptn);
 			}
-			theta += block*VCSIZE;
+			theta += block;
 
             // bugfix 2016-01-21, prob_const can be rescaled
-            for (j = 0; j < VCSIZE; j++)
-                if (sum_scale_num[ptn+j-orig_nptn] >= 1)
-                    vc_ptn[j] = vc_ptn[j] * SCALING_THRESHOLD;
+            if (min_scale >= 1)
+                this_vc_ptn *= VectorClass(SCALING_THRESHOLD);
 
-			// ptn_invar[ptn] is not aligned
-			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+			// no +I for +ASC!
+			prob_const = horizontal_add(this_vc_ptn);
 
 		}
+
+        /*
 		switch ((nptn-orig_nptn) % VCSIZE) {
 		case 0:
 			prob_const = horizontal_add(lh_final+lh_ptn);
@@ -1184,6 +1374,7 @@ double PhyloTree::computeLikelihoodFromBufferEigenSIMD() {
 			assert(0);
 			break;
 		}
+        */
     	prob_const = log(1.0 - prob_const);
     	tree_lh -= aln->getNSite() * prob_const;
     	for (ptn = 0; ptn < orig_nptn; ptn++)
@@ -1270,8 +1461,8 @@ void PhyloTree::computePartialParsimonyFastSIMD(PhyloNeighbor *dad_branch, Phylo
     if (dad_branch->partial_lh_computed & 2)
         return;
     Node *node = dad_branch->node;
-    int nstates = aln->num_states;
-    int site;
+    int nstates = aln->getMaxNumStates();
+    int site = 0;
     const int VCSIZE = VectorClass::size();
     const int NUM_BITS = VectorClass::size() * UINT_BITS;
 
@@ -1279,130 +1470,144 @@ void PhyloTree::computePartialParsimonyFastSIMD(PhyloNeighbor *dad_branch, Phylo
 
     if (node->isLeaf() && dad) {
         // external node
+        vector<Alignment*> *partitions = NULL;
+        if (aln->isSuperAlignment())
+            partitions = &((SuperAlignment*)aln)->partitions;
+        else {
+            partitions = new vector<Alignment*>;
+            partitions->push_back(aln);
+        }
         if (aln->ordered_pattern.empty())
             aln->orderPatternByNumChars();
         int leafid = node->id;
         int pars_size = getBitsBlockSize();
         memset(dad_branch->partial_pars, 0, pars_size*sizeof(UINT));
-//        int ptn;
-//        int nptn = aln->size();
     	int ambi_aa[] = {2, 3, 5, 6, 9, 10}; // {4+8, 32+64, 512+1024};
-//        int max_sites = ((aln->num_informative_sites+UINT_BITS-1)/UINT_BITS)*UINT_BITS;
-//        UINT *x = dad_branch->partial_pars - (nstates*VCSIZE);
         UINT *x = dad_branch->partial_pars;
-        Alignment::iterator pat;
-    	switch (aln->seq_type) {
-    	case SEQ_DNA:
-            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
-            	int state = pat->at(leafid);
-                int freq = pat->frequency;
-                if (state < 4) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += 4*VCSIZE;
-                            site = 0;
+        int start_pos = 0;
+
+        for (vector<Alignment*>::iterator alnit = partitions->begin(); alnit != partitions->end(); alnit++) {
+            int end_pos = start_pos + (*alnit)->ordered_pattern.size();
+            switch ((*alnit)->seq_type) {
+            case SEQ_DNA:
+                for (int patid = start_pos; patid != end_pos; patid++) {
+                    Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
+                    int state = pat->at(leafid);
+                    int freq = pat->frequency;
+                    if (state < 4) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
                         }
-                        x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
-                    }
-                } else if (state == aln->STATE_UNKNOWN) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += 4*VCSIZE;
-                            site = 0;
+                    } else if (state == (*alnit)->STATE_UNKNOWN) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            UINT *p = x+(site/UINT_BITS);
+                            p[0] |= bit1;
+                            p[VCSIZE] |= bit1;
+                            p[2*VCSIZE] |= bit1;
+                            p[3*VCSIZE] |= bit1;
                         }
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        UINT *p = x+(site/UINT_BITS);
-                        p[0] |= bit1;
-                        p[VCSIZE] |= bit1;
-                        p[2*VCSIZE] |= bit1;
-                        p[3*VCSIZE] |= bit1;
-                    }
-                } else {
-                	state -= 3;
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += 4*VCSIZE;
-                            site = 0;
+                    } else {
+                        state -= 3;
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            UINT *p = x + ((site/UINT_BITS));
+                            
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            for (int i = 0; i < 4; i++)
+                                if (state & (1<<i))
+                                    p[i*VCSIZE] |= bit1;
                         }
-                        UINT *p = x + ((site/UINT_BITS));
-                        
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        for (int i = 0; i < 4; i++)
-                            if (state & (1<<i))
-                                p[i*VCSIZE] |= bit1;
                     }
                 }
-            }
-    		break;
-    	case SEQ_PROTEIN:
-            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
-            	int state = pat->at(leafid);
-                int freq = pat->frequency;
-                if (state < 20) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += 20*VCSIZE;
-                            site = 0;
+                break;
+            case SEQ_PROTEIN:
+                for (int patid = start_pos; patid != end_pos; patid++) {
+                    Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
+                    int state = pat->at(leafid);
+                    int freq = pat->frequency;
+                    if (state < 20) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
                         }
-                        x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
-                    }
-                } else if (state == aln->STATE_UNKNOWN) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += 20*VCSIZE;
-                            site = 0;
+                    } else if (state == (*alnit)->STATE_UNKNOWN) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            UINT *p = x+(site/UINT_BITS);
+                            for (int i = 0; i < 20; i++)
+                                p[i*VCSIZE] |= bit1;
                         }
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        UINT *p = x+(site/UINT_BITS);
-                        for (int i = 0; i < 20; i++)
-                            p[i*VCSIZE] |= bit1;
-                    }
-                } else {
-                	assert(state < 23);
-            		state = (state-20)*2;
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += 20*VCSIZE;
-                            site = 0;
+                    } else {
+                        assert(state < 23);
+                        state = (state-20)*2;
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            UINT *p = x + ((site/UINT_BITS));
+                            UINT bit1 = (1 << (site%UINT_BITS));
+
+                            p[ambi_aa[state]*VCSIZE] |= bit1;
+                            p[ambi_aa[state+1]*VCSIZE] |= bit1;
                         }
-                        UINT *p = x + ((site/UINT_BITS));
-                        UINT bit1 = (1 << (site%UINT_BITS));
-
-                        p[ambi_aa[state]*VCSIZE] |= bit1;
-                        p[ambi_aa[state+1]*VCSIZE] |= bit1;
                     }
                 }
-            }
-    		break;
-    	default:
-            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
-            	int state = pat->at(leafid);
-                int freq = pat->frequency;
-                if (state < nstates) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += nstates*VCSIZE;
-                            site = 0;
+                break;
+            default:
+                for (int patid = start_pos; patid != end_pos; patid++) {
+                    Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
+                    int state = pat->at(leafid);
+                    int freq = pat->frequency;
+                    if (state < (*alnit)->num_states) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
                         }
-                        x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
-                    }
-                } else if (state == aln->STATE_UNKNOWN) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        if (site == NUM_BITS) {
-                            x += nstates*VCSIZE;
-                            site = 0;
+                    } else if (state == (*alnit)->STATE_UNKNOWN) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            if (site == NUM_BITS) {
+                                x += nstates*VCSIZE;
+                                site = 0;
+                            }
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            UINT *p = x+(site/UINT_BITS);
+                            for (int i = 0; i < (*alnit)->num_states; i++)
+                                p[i*VCSIZE] |= bit1;
                         }
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        UINT *p = x+(site/UINT_BITS);
-                        for (int i = 0; i < nstates; i++)
-                            p[i*VCSIZE] |= bit1;
+                    } else {
+                        assert(0);
                     }
-                } else {
-                	assert(0);
                 }
-            }
-    		break;
-    	}
+                break;
+            } // end of switch
+            start_pos = end_pos;
+        } // end of FOR loop over partitions
+
+        assert(start_pos == aln->ordered_pattern.size());
+//        assert(site == aln->num_informative_sites % NUM_BITS);
         // add dummy states
         if (site > 0 && site < NUM_BITS) {
             x += site/UINT_BITS;
@@ -1411,6 +1616,8 @@ void PhyloTree::computePartialParsimonyFastSIMD(PhyloNeighbor *dad_branch, Phylo
             int max_sites = ((site+UINT_BITS-1)/UINT_BITS);
             memset(x, 255, (VCSIZE - max_sites)*sizeof(UINT));
         }
+        if (!aln->isSuperAlignment())
+            delete partitions;
     } else {
         // internal node
         assert(node->degree() == 3); // it works only for strictly bifurcating tree
@@ -1433,7 +1640,7 @@ void PhyloTree::computePartialParsimonyFastSIMD(PhyloNeighbor *dad_branch, Phylo
             #pragma omp parallel for private (site) reduction(+: score) if(nsites>200)
             #endif
 			for (site = 0; site<nsites; site++) {
-                size_t offset = 4*VCSIZE*site;
+                size_t offset = entry_size*site;
                 VectorClass *x = (VectorClass*)(left->partial_pars + offset);
                 VectorClass *y = (VectorClass*)(right->partial_pars + offset);
                 VectorClass *z = (VectorClass*)(dad_branch->partial_pars + offset);
@@ -1504,7 +1711,7 @@ int PhyloTree::computeParsimonyBranchFastSIMD(PhyloNeighbor *dad_branch, PhyloNo
     if ((node_branch->partial_lh_computed & 2) == 0)
         computePartialParsimonyFastSIMD<VectorClass>(node_branch, node);
     int site;
-    int nstates = aln->num_states;
+    int nstates = aln->getMaxNumStates();
 
 //    VectorClass score = 0;
 //    VectorClass w;
@@ -1576,4 +1783,4 @@ int PhyloTree::computeParsimonyBranchFastSIMD(PhyloNeighbor *dad_branch, PhyloNo
 }
 
 
-#endif /* PHYLOKERNEL_H_ */
+#endif /* PHYLOKERNELSAFE_H_ */
diff --git a/phylokernelsitemodel.cpp b/phylokernelsitemodel.cpp
index d681d6f..da70ef0 100644
--- a/phylokernelsitemodel.cpp
+++ b/phylokernelsitemodel.cpp
@@ -590,7 +590,7 @@ double PhyloTree::computeSitemodelLikelihoodBranchEigen(PhyloNeighbor *dad_branc
     size_t nptn = aln->size();
 
 
-	memset(_pattern_lh_cat, 0, nptn*ncat*sizeof(double));
+	memset(_pattern_lh_cat, 0, sizeof(double)*nptn*ncat);
     ModelSet *models = (ModelSet*)model;
 
     if (dad->isLeaf()) {
@@ -610,7 +610,7 @@ double PhyloTree::computeSitemodelLikelihoodBranchEigen(PhyloNeighbor *dad_branc
                 double len = site_rate->getRate(c)*dad_branch->length;
                 double prop = site_rate->getProp(c);
 				for (i = 0; i < nstates; i++) {
-					*lh_cat +=  exp(eval[i]*len) * partial_lh_node[i] * partial_lh_dad[i];
+					*lh_cat +=  (exp(eval[i]*len) * partial_lh_node[i] * partial_lh_dad[i]);
 				}
                 *lh_cat *= prop;
 				lh_ptn += *lh_cat;
@@ -640,7 +640,7 @@ double PhyloTree::computeSitemodelLikelihoodBranchEigen(PhyloNeighbor *dad_branc
                 double len = site_rate->getRate(c)*dad_branch->length;
                 double prop = site_rate->getProp(c);
 				for (i = 0; i < nstates; i++) {
-					*lh_cat +=  exp(eval[i]*len) * partial_lh_node[i] * partial_lh_dad[i];
+					*lh_cat +=  (exp(eval[i]*len) * partial_lh_node[i] * partial_lh_dad[i]);
 				}
                 *lh_cat *= prop;
 				lh_ptn += *lh_cat;
diff --git a/phylokernelsse.cpp b/phylokernelsse.cpp
new file mode 100644
index 0000000..bb3ee7f
--- /dev/null
+++ b/phylokernelsse.cpp
@@ -0,0 +1,169 @@
+/*
+ * phylokernelsse.cpp
+ *
+ *  Created on: Sept 25, 2016
+ *      Author: minh
+ */
+
+
+#include "vectorclass/vectormath_exp.h"
+#include "vectorclass/vectorclass.h"
+#include "phylokernel.h"
+//#include "phylokernelsafe.h"
+//#include "phylokernelmixture.h"
+//#include "phylokernelmixrate.h"
+//#include "phylokernelsitemodel.h"
+
+#include "phylokernelnew.h"
+#define KERNEL_FIX_STATES
+#include "phylokernelnew.h"
+
+
+#if !defined ( __SSE2__ ) && !defined ( __x86_64__ )
+#error "You must compile this file with SSE2 enabled!"
+#endif
+
+void PhyloTree::setParsimonyKernelSSE() { // point both parsimony function pointers at the Vec4ui instantiations of the SIMD kernels
+	computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchFastSIMD<Vec4ui>; // kernel that returns the parsimony score for a branch
+    computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyFastSIMD<Vec4ui>; // kernel that fills a neighbor's partial_pars bit vectors
+}
+
+void PhyloTree::setDotProductSSE() { // bind the dot-product function pointers to their SSE (Vec4f / Vec2d) instantiations
+#ifdef BOOT_VAL_FLOAT
+		dotProduct = &PhyloTree::dotProductSIMD<float, Vec4f>; // float variant, used only when BOOT_VAL_FLOAT is defined
+#else
+		dotProduct = &PhyloTree::dotProductSIMD<double, Vec2d>; // double variant otherwise
+#endif
+        dotProductDouble = &PhyloTree::dotProductSIMD<double, Vec2d>; // always double, independent of BOOT_VAL_FLOAT
+}
+
+void PhyloTree::setLikelihoodKernelSSE() { // install the Vec2d likelihood kernels, chosen by model type, scaling mode and aln->num_states
+    vector_size = 2; // NOTE(review): presumably the number of doubles per Vec2d vector — confirm
+    setParsimonyKernelSSE(); // the parsimony kernels are always set, whatever likelihood kernel is picked below
+
+    if (model_factory && model_factory->model->isSiteSpecificModel() && (params->lk_safe_scaling || leafNum >= params->numseq_safe_scaling)) { // 1) site-specific model with safe scaling (SAFE_LH)
+        switch (aln->num_states) {
+        case 4: // kernels with the state count fixed to 4 at compile time
+            computeLikelihoodBranchPointer     = &PhyloTree::computeLikelihoodBranchSIMD    <Vec2d, SAFE_LH, 4, false, true>; // NOTE(review): meaning of the trailing <false, true> flags is declared in phylokernelnew.h — confirm
+            computeLikelihoodDervPointer       = &PhyloTree::computeLikelihoodDervSIMD      <Vec2d, SAFE_LH, 4, false, true>;
+            computePartialLikelihoodPointer    =  &PhyloTree::computePartialLikelihoodSIMD  <Vec2d, SAFE_LH, 4, false, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec2d, SAFE_LH, 4, false, true>;
+            break;
+        case 20: // kernels with the state count fixed to 20 at compile time
+            computeLikelihoodBranchPointer     = &PhyloTree::computeLikelihoodBranchSIMD    <Vec2d, SAFE_LH, 20, false, true>;
+            computeLikelihoodDervPointer       = &PhyloTree::computeLikelihoodDervSIMD      <Vec2d, SAFE_LH, 20, false, true>;
+            computePartialLikelihoodPointer    = &PhyloTree::computePartialLikelihoodSIMD   <Vec2d, SAFE_LH, 20, false, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec2d, SAFE_LH, 20, false, true>;
+            break;
+        default: // any other state count uses the generic kernels
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD        <Vec2d, SAFE_LH, false, true>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD            <Vec2d, SAFE_LH, false, true>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD      <Vec2d, SAFE_LH, false, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec2d, SAFE_LH, false, true>;
+            break;
+        }
+        return; // pointers are set; skip the remaining cases
+    }
+
+    if (model_factory && model_factory->model->isSiteSpecificModel()) { // 2) site-specific model without safe scaling (NORM_LH)
+        switch (aln->num_states) {
+        case 4:
+            computeLikelihoodBranchPointer     = &PhyloTree::computeLikelihoodBranchSIMD    <Vec2d, NORM_LH, 4, false, true>;
+            computeLikelihoodDervPointer       = &PhyloTree::computeLikelihoodDervSIMD      <Vec2d, NORM_LH, 4, false, true>;
+            computePartialLikelihoodPointer    =  &PhyloTree::computePartialLikelihoodSIMD  <Vec2d, NORM_LH, 4, false, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec2d, NORM_LH, 4, false, true>;
+            break;
+        case 20:
+            computeLikelihoodBranchPointer     = &PhyloTree::computeLikelihoodBranchSIMD    <Vec2d, NORM_LH, 20, false, true>;
+            computeLikelihoodDervPointer       = &PhyloTree::computeLikelihoodDervSIMD      <Vec2d, NORM_LH, 20, false, true>;
+            computePartialLikelihoodPointer    = &PhyloTree::computePartialLikelihoodSIMD   <Vec2d, NORM_LH, 20, false, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec2d, NORM_LH, 20, false, true>;
+            break;
+        default:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD        <Vec2d, NORM_LH, false, true>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD            <Vec2d, NORM_LH, false, true>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD      <Vec2d, NORM_LH, false, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec2d, NORM_LH, false, true>;
+            break;
+        }
+        return;
+    }
+
+    if (params->lk_safe_scaling || leafNum >= params->numseq_safe_scaling) { // 3) non-site-specific model with safe scaling (SAFE_LH)
+	switch(aln->num_states) {
+        /*
+        case 2:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec2d, SAFE_LH, 2>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec2d, SAFE_LH, 2>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec2d, SAFE_LH, 2>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec2d, SAFE_LH, 2>;
+            break;
+        */
+        case 4:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec2d, SAFE_LH, 4>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec2d, SAFE_LH, 4>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec2d, SAFE_LH, 4>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec2d, SAFE_LH, 4>;
+            break;
+        case 20:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec2d, SAFE_LH, 20>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec2d, SAFE_LH, 20>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec2d, SAFE_LH, 20>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec2d, SAFE_LH, 20>;
+            break;
+        /*
+        case 64:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec2d, SAFE_LH, 64>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec2d, SAFE_LH, 64>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec2d, SAFE_LH, 64>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec2d, SAFE_LH, 64>;
+            break;
+        */
+        default: // the 2- and 64-state specializations are commented out, so those alignments also land here
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD<Vec2d, SAFE_LH>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD<Vec2d, SAFE_LH>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD<Vec2d, SAFE_LH>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec2d, SAFE_LH>;
+            break;
+        }
+        return;
+    }
+
+	switch(aln->num_states) { // 4) fall-through case: non-site-specific model without safe scaling (NORM_LH)
+    /*
+	case 2:
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec2d, NORM_LH, 2>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec2d, NORM_LH, 2>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec2d, NORM_LH, 2>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec2d, NORM_LH, 2>;
+		break;
+    */
+	case 4:
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec2d, NORM_LH, 4>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec2d, NORM_LH, 4>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec2d, NORM_LH, 4>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec2d, NORM_LH, 4>;
+		break;
+	case 20:
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec2d, NORM_LH, 20>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec2d, NORM_LH, 20>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec2d, NORM_LH, 20>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec2d, NORM_LH, 20>;
+		break;
+    /*
+	case 64:
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec2d, NORM_LH, 64>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec2d, NORM_LH, 64>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec2d, NORM_LH, 64>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec2d, NORM_LH, 64>;
+		break;
+    */
+	default:
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD<Vec2d, NORM_LH>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD<Vec2d, NORM_LH>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD<Vec2d, NORM_LH>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec2d, NORM_LH>;
+		break;
+	}
+}
+
diff --git a/phylonode.cpp b/phylonode.cpp
index fa39d84..c33de75 100644
--- a/phylonode.cpp
+++ b/phylonode.cpp
@@ -19,33 +19,15 @@ void PhyloNeighbor::clearForwardPartialLh(Node *dad) {
 			((PhyloNeighbor*)*it)->clearForwardPartialLh(node);
 }
 
-void PhyloNeighbor::reorientPartialLh(Node *dad) {
-    if (partial_lh)
-        return;
-    bool done = false;
-    FOR_NEIGHBOR_IT(node, dad, it) {
-        PhyloNeighbor *backnei = (PhyloNeighbor*)(*it)->node->findNeighbor(node);
-        if (backnei->partial_lh) {
-            partial_lh = backnei->partial_lh;
-            scale_num = backnei->scale_num;
-            backnei->partial_lh = NULL;
-            backnei->scale_num = NULL;
-            backnei->partial_lh_computed &= ~1; // clear bit
-            done = true;
-            break;
-        }
-    }
-    assert(done && "partial_lh is not re-oriented");
-}
-
-
 void PhyloNode::clearReversePartialLh(PhyloNode *dad) {
 //	PhyloNeighbor *node_nei = (PhyloNeighbor*)findNeighbor(dad);
 //	assert(node_nei);
 //	node_nei->partial_lh_computed = 0;
 	for (NeighborVec::iterator it = neighbors.begin(); it != neighbors.end(); it ++)
 		if ((*it)->node != dad) {
-			((PhyloNeighbor*)(*it)->node->findNeighbor(this))->partial_lh_computed = 0;
+            PhyloNeighbor *nei = (PhyloNeighbor*)(*it)->node->findNeighbor(this);
+			nei->partial_lh_computed = 0;
+            nei->size = 0;
 			((PhyloNode*)(*it)->node)->clearReversePartialLh(this);
 		}
 }
@@ -55,10 +37,17 @@ void PhyloNode::clearAllPartialLh(bool make_null, PhyloNode *dad) {
 	node_nei->partial_lh_computed = 0;
 	if (make_null) node_nei->partial_lh = NULL;
 
+
+    if (Params::getInstance().lh_mem_save == LM_MEM_SAVE)
+        node_nei->size = 0;
+
 	node_nei = (PhyloNeighbor*)dad->findNeighbor(this);
 	node_nei->partial_lh_computed = 0;
 	if (make_null) node_nei->partial_lh = NULL;
 
+    if (Params::getInstance().lh_mem_save == LM_MEM_SAVE)
+        node_nei->size = 0;
+
 	for (NeighborVec::iterator it = neighbors.begin(); it != neighbors.end(); it ++)
 		if ((*it)->node != dad)
 			((PhyloNode*)(*it)->node)->clearAllPartialLh(make_null, this);
@@ -94,3 +83,21 @@ void PhyloNode::init() {
 void PhyloNode::addNeighbor(Node *node, double length, int id) {
 	neighbors.push_back(new PhyloNeighbor(node, length, id));
 }
+
+
+int PhyloNode::computeSize(Node *dad) { // returns the number of leaves in the subtree at this node as seen from dad, caching it on the neighbor
+    PhyloNeighbor *nei = (PhyloNeighbor*)dad->findNeighbor(this); // the dad->this neighbor record stores the cached size
+    if (nei->size > 0) // a positive value was computed earlier; 0 marks "not computed"
+        return nei->size;
+
+    if (isLeaf()) { // base case: a leaf counts as one taxon
+        nei->size = 1;
+        return nei->size;
+    }
+    nei->size = 0;
+    FOR_NEIGHBOR_IT(this, dad, it) { // NOTE(review): presumably iterates over all neighbors of this node except dad — confirm against the macro definition
+        nei->size += ((PhyloNode*)(*it)->node)->computeSize(this); // recurse into each neighbor's subtree and accumulate its size
+    }
+    return nei->size;
+}
+
diff --git a/phylonode.h b/phylonode.h
index 5bc4563..0e9ff79 100644
--- a/phylonode.h
+++ b/phylonode.h
@@ -14,7 +14,7 @@
 
 #include "node.h"
 
-typedef short int UBYTE;
+typedef unsigned short UBYTE;
 
 /**
 A neighbor in a phylogenetic tree
@@ -26,6 +26,7 @@ class PhyloNeighbor : public Neighbor {
     friend class PhyloTree;
     friend class IQTree;
     friend class PhyloSuperTree;
+    friend class MemSlotVector;
 
 public:
     friend class TinaTree;
@@ -42,6 +43,7 @@ public:
         partial_lh_computed = 0;
         lh_scale_factor = 0.0;
         partial_pars = NULL;
+        size = 0;
     }
 
     /**
@@ -56,6 +58,7 @@ public:
         partial_lh_computed = 0;
         lh_scale_factor = 0.0;
         partial_pars = NULL;
+        size = 0;
     }
 
     /**
@@ -79,10 +82,11 @@ public:
     void clearForwardPartialLh(Node *dad);
 
     /**
+        DEPRECATED, moved to PhyloTree
         if partial_lh is NULL, reorient partial_lh (LM_PER_NODE technique)
         @param dad dad of this neighbor
     */
-    void reorientPartialLh(Node *dad);
+//    void reorientPartialLh(Node *dad);
 
 	/**
 	* For Upper Bounds analysis: get partial likelihood and lh scale factor
@@ -99,6 +103,10 @@ public:
 	return partial_lh_computed;
 	}
 
+    int getSize() { // accessor for the private 'size' member, which the constructors initialise to 0
+        return size;
+    }
+
 private:
 
     /**
@@ -126,6 +134,9 @@ private:
      */
     UINT *partial_pars;
 
+    /** size of subtree below this neighbor in terms of number of taxa */
+    int size;
+
 };
 
 /**
@@ -189,6 +200,13 @@ public:
 
     void computeReversePartialLh(PhyloNode *dad);
 
+    /** 
+        compute the size (#taxa) of the subtree rooted at this node
+        using buffered 'size' attribute if computed beforehand
+        @param dad dad of this node
+    */
+    int computeSize(Node *dad);
+
 };
 
 
diff --git a/phylosupertree.cpp b/phylosupertree.cpp
index 9dbf3e1..8fb8918 100644
--- a/phylosupertree.cpp
+++ b/phylosupertree.cpp
@@ -551,8 +551,9 @@ PhyloSuperTree::PhyloSuperTree(Params &params) :  IQTree() {
     
 #ifdef _OPENMP
     if (params.num_threads > size()) {
-        outWarning("More threads (" + convertIntToString(params.num_threads) + ") than number of partitions (" + convertIntToString(size()) + ") might not be necessary.");
-        outWarning("You are recommended to rerun with '-nt " + convertIntToString(size()) + "' and see if this is faster");
+        cout << "Info: multi-threading strategy over alignment sites" << endl;
+    } else {
+        cout << "Info: multi-threading strategy over partitions" << endl;
     }
 #endif
 	cout << endl;
@@ -568,34 +569,23 @@ void PhyloSuperTree::setParams(Params* params) {
 
 void PhyloSuperTree::initSettings(Params &params) {
 	IQTree::initSettings(params);
+    num_threads = (size() >= params.num_threads) ? params.num_threads : 1;
 	for (iterator it = begin(); it != end(); it++) {
 		(*it)->params = &params;
-		(*it)->setLikelihoodKernel(params.SSE);
+		(*it)->setLikelihoodKernel(params.SSE, (size() >= params.num_threads) ? 1 : params.num_threads);
 		(*it)->optimize_by_newton = params.optimize_by_newton;
 	}
 
 }
 
-void PhyloSuperTree::setLikelihoodKernel(LikelihoodKernel lk) {
-    PhyloTree::setLikelihoodKernel(lk);
+void PhyloSuperTree::setLikelihoodKernel(LikelihoodKernel lk, int num_threads) {
+    PhyloTree::setLikelihoodKernel(lk, (size() >= num_threads) ? num_threads : 1);
     for (iterator it = begin(); it != end(); it++)
-        (*it)->setLikelihoodKernel(lk);    
+        (*it)->setLikelihoodKernel(lk, (size() >= num_threads) ? 1 : num_threads);
 }
 
 void PhyloSuperTree::changeLikelihoodKernel(LikelihoodKernel lk) {
 	PhyloTree::changeLikelihoodKernel(lk);
-//	if ((sse == LK_EIGEN || sse == LK_EIGEN_SSE) && (lk == LK_NORMAL || lk == LK_SSE)) {
-//		// need to increase the memory usage when changing from new kernel to old kernel
-//        setLikelihoodKernel(lk);
-//        for (iterator it = begin(); it != end(); it++)
-//            (*it)->setLikelihoodKernel(lk);
-//		deleteAllPartialLh();
-//		initializeAllPartialLh();
-//		clearAllPartialLH();
-//    } else {
-//        for (iterator it = begin(); it != end(); it++)
-//            (*it)->setLikelihoodKernel(lk);
-//    }
 }
 
 string PhyloSuperTree::getTreeString() {
@@ -891,7 +881,7 @@ void PhyloSuperTree::clearAllPartialLH(bool make_null) {
     }
 }
 
-int PhyloSuperTree::computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
+int PhyloSuperTree::computeParsimonyBranchObsolete(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
     int score = 0, part = 0;
     SuperNeighbor *dad_nei = (SuperNeighbor*)dad_branch;
     SuperNeighbor *node_nei = (SuperNeighbor*)(dad_branch->node->findNeighbor(dad));
@@ -967,7 +957,7 @@ double PhyloSuperTree::computeLikelihood(double *pattern_lh) {
 	} else {
         if (part_order.empty()) computePartitionOrder();
 		#ifdef _OPENMP
-		#pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(ntrees >= params->num_threads)
+		#pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(num_threads > 1)
 		#endif
 		for (int j = 0; j < ntrees; j++) {
             int i = part_order[j];
@@ -1028,7 +1018,7 @@ double PhyloSuperTree::optimizeAllBranches(int my_iterations, double tolerance,
 	int ntrees = size();
     if (part_order.empty()) computePartitionOrder();
 	#ifdef _OPENMP
-	#pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(ntrees >= params->num_threads)
+	#pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(num_threads > 1)
 	#endif
 	for (int j = 0; j < ntrees; j++) {
         int i = part_order[j];
@@ -1123,6 +1113,26 @@ NNIMove PhyloSuperTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NN
 		break;
 	}
 
+    // check for compatibility with constraint tree
+    bool nni_ok[2] = {true, true};
+    int nniid = 0;
+	FOR_NEIGHBOR(node2, node1, node2_it) {
+        NNIMove nni;
+        nni.node1 = node1;
+        nni.node2 = node2;
+        nni.node1Nei_it = node1->findNeighborIt(node1_nei->node);
+        nni.node2Nei_it = node2_it;
+        nni_ok[nniid++] = constraintTree.isCompatible(nni);
+    }
+    assert(nniid == 2);
+    myMove.node1 = myMove.node2 = NULL;
+    myMove.newloglh = -DBL_MAX;
+    // return if both NNIs do not satisfy constraint
+    if (!nni_ok[0] && !nni_ok[1]) {
+        assert(!nniMoves);
+        return myMove;
+    }
+
 	//double bestScore = optimizeOneBranch(node1, node2, false);
 
 	int ntrees = size(), part;
@@ -1131,7 +1141,7 @@ NNIMove PhyloSuperTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NN
 
     if (part_order.empty()) computePartitionOrder();
 	#ifdef _OPENMP
-	#pragma omp parallel for reduction(+: nni_score1, nni_score2, local_totalNNIs, local_evalNNIs) private(part) schedule(dynamic) if(ntrees >= params->num_threads)
+	#pragma omp parallel for reduction(+: nni_score1, nni_score2, local_totalNNIs, local_evalNNIs) private(part) schedule(dynamic) if(num_threads>1)
 	#endif
 	for (int treeid = 0; treeid < ntrees; treeid++) {
         part = part_order_by_nptn[treeid];
@@ -1201,6 +1211,9 @@ NNIMove PhyloSuperTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NN
 	totalNNIs += local_totalNNIs;
 	evalNNIs += local_evalNNIs;
 	double nni_scores[2] = {nni_score1, nni_score2};
+    
+    if (!nni_ok[0]) nni_scores[0] = -DBL_MAX;
+    if (!nni_ok[1]) nni_scores[1] = -DBL_MAX;
 
 	myMove.node1Nei_it = node1->findNeighborIt(node1_nei->node);
 	myMove.node1 = node1;
@@ -1221,8 +1234,9 @@ NNIMove PhyloSuperTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NN
     //now setup pattern likelihoods per partition
 	double *save_lh_factor = new double [ntrees];
 	double *save_lh_factor_back = new double [ntrees];
-	int nnino = 0;
-	FOR_NEIGHBOR(node2, node1, node2_it) {
+	nniid = 0;
+	FOR_NEIGHBOR(node2, node1, node2_it) if (nni_ok[nniid]) 
+    {
 
 		// do the NNI
 		node2_nei = (SuperNeighbor*)(*node2_it);
@@ -1242,18 +1256,18 @@ NNIMove PhyloSuperTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NN
 			if (!is_nni)
 				memcpy(at(part)->_pattern_lh, part_info[part].cur_ptnlh, at(part)->getAlnNPattern() * sizeof(double));
 			else
-				memcpy(at(part)->_pattern_lh, part_info[part].nniMoves[nnino].ptnlh, at(part)->getAlnNPattern() * sizeof(double));
+				memcpy(at(part)->_pattern_lh, part_info[part].nniMoves[nniid].ptnlh, at(part)->getAlnNPattern() * sizeof(double));
     		save_lh_factor[part] = at(part)->current_it->lh_scale_factor;
     		save_lh_factor_back[part] = at(part)->current_it_back->lh_scale_factor;
     		at(part)->current_it->lh_scale_factor = 0.0;
     		at(part)->current_it_back->lh_scale_factor = 0.0;
         }
         if (nniMoves) {
-        	nniMoves[nnino].newloglh = nni_scores[nnino];
-       		computePatternLikelihood(nniMoves[nnino].ptnlh, &nni_scores[nnino]);
+        	nniMoves[nniid].newloglh = nni_scores[nniid];
+       		computePatternLikelihood(nniMoves[nniid].ptnlh, &nni_scores[nniid]);
         }
         if (save_all_trees == 2)
-        	saveCurrentTree(nni_scores[nnino]);
+        	saveCurrentTree(nni_scores[nniid]);
 
         // restore information
         for (part = 0; part < ntrees; part++) {
@@ -1266,7 +1280,7 @@ NNIMove PhyloSuperTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NN
         node1_nei->node->updateNeighbor(node2, node1);
         node2->updateNeighbor(node2_it, node2_nei);
         node2_nei->node->updateNeighbor(node1, node2);
-        nnino++;
+        nniid++;
 
 	}
 
@@ -1433,12 +1447,12 @@ PhyloTree *PhyloSuperTree::extractSubtree(IntVector &ids) {
 	return tree;
 }
 
-uint64_t PhyloSuperTree::getMemoryRequired(size_t ncategory) {
+uint64_t PhyloSuperTree::getMemoryRequired(size_t ncategory, bool full_mem) {
 //	uint64_t mem_size = PhyloTree::getMemoryRequired(ncategory);
 	// supertree does not need any memory for likelihood vectors!
 	uint64_t mem_size = 0;
 	for (iterator it = begin(); it != end(); it++)
-		mem_size += (*it)->getMemoryRequired(ncategory);
+		mem_size += (*it)->getMemoryRequired(ncategory, full_mem);
 	return mem_size;
 }
 
diff --git a/phylosupertree.h b/phylosupertree.h
index e850977..d21ea97 100644
--- a/phylosupertree.h
+++ b/phylosupertree.h
@@ -96,7 +96,7 @@ public:
 	 */
 	virtual void initSettings(Params& params);
 
-    virtual void setLikelihoodKernel(LikelihoodKernel lk);
+    virtual void setLikelihoodKernel(LikelihoodKernel lk, int num_threads);
 
     virtual void changeLikelihoodKernel(LikelihoodKernel lk);
 
@@ -265,6 +265,7 @@ public:
     virtual void changeNNIBrans(NNIMove nnimove);
 
     /**
+        OBSOLETE!
      * 	 Restore the branch lengths from the saved values
 	 * @param node the current node of the post-order tree traversal
 	 * @param dad the dad of that node used to direct the traversal
@@ -321,7 +322,7 @@ public:
      * compute the memory size required for storing partial likelihood vectors
      * @return memory size required in bytes
      */
-    virtual uint64_t getMemoryRequired(size_t ncategory = 1);
+    virtual uint64_t getMemoryRequired(size_t ncategory = 1, bool full_mem = false);
 
     /**
      * count the number of super branches that map to no branches in gene trees
@@ -338,7 +339,7 @@ public:
      */
     virtual int fixNegativeBranch(bool force = false, Node *node = NULL, Node *dad = NULL);
 
-    virtual int computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
+    virtual int computeParsimonyBranchObsolete(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
 
     /** True when mixed codon with other data type */
     bool rescale_codon_brlen;
diff --git a/phylosupertreeplen.cpp b/phylosupertreeplen.cpp
index 929be9a..753d84f 100644
--- a/phylosupertreeplen.cpp
+++ b/phylosupertreeplen.cpp
@@ -129,7 +129,7 @@ double PartitionModelPlen::optimizeParameters(int fixed_len, bool write_info, do
     	cur_lh = 0.0;
         if (tree->part_order.empty()) tree->computePartitionOrder();
         #ifdef _OPENMP
-        #pragma omp parallel for reduction(+: cur_lh) schedule(dynamic) if(ntrees >= tree->params->num_threads)
+        #pragma omp parallel for reduction(+: cur_lh) schedule(dynamic) if(tree->num_threads > 1)
         #endif
     	for (int partid = 0; partid < ntrees; partid++) {
             int part = tree->part_order[partid];
@@ -229,15 +229,28 @@ double PartitionModelPlen::optimizeGeneRate(double gradient_epsilon)
     double score = 0.0;
     double nsites = tree->getAlnNSite();
 
+    DoubleVector brlen;
+    brlen.resize(tree->branchNum);
+    tree->getBranchLengths(brlen);
+    double max_brlen = 0.0;
+    for (i = 0; i < brlen.size(); i++)
+        if (brlen[i] > max_brlen)
+            max_brlen = brlen[i];
+
     if (tree->part_order.empty()) tree->computePartitionOrder();
 
     #ifdef _OPENMP
-    #pragma omp parallel for reduction(+: score) private(i) schedule(dynamic) if(tree->size() >= tree->params->num_threads)
+    #pragma omp parallel for reduction(+: score) private(i) schedule(dynamic) if(tree->num_threads > 1)
     #endif    
     for (int j = 0; j < tree->size(); j++) {
         int i = tree->part_order[j];
+        double min_scaling = 1.0/tree->at(i)->getAlnNSite();
         double max_scaling = nsites / tree->at(i)->getAlnNSite();
-        tree->part_info[i].cur_score = tree->at(i)->optimizeTreeLengthScaling(1.0/tree->at(i)->getAlnNSite(), tree->part_info[i].part_rate, max_scaling, gradient_epsilon);
+        if (max_scaling < tree->part_info[i].part_rate)
+            max_scaling = tree->part_info[i].part_rate;
+        if (min_scaling > tree->part_info[i].part_rate)
+            min_scaling = tree->part_info[i].part_rate;
+        tree->part_info[i].cur_score = tree->at(i)->optimizeTreeLengthScaling(min_scaling, tree->part_info[i].part_rate, max_scaling, gradient_epsilon);
         score += tree->part_info[i].cur_score;
     }
     // now normalize the rates
@@ -251,6 +264,12 @@ double PartitionModelPlen::optimizeGeneRate(double gradient_epsilon)
             nsite += tree->at(i)->aln->getNSite();
     }
     sum /= nsite;
+    
+    if (sum > tree->params->max_branch_length / max_brlen) {
+        cerr << endl << "ERROR: Too high (saturated) partition rates of the proportion partition model!"
+            << endl <<  "Please switch to the edge-equal partition model via -q option instead of -spp" << endl << endl;
+        exit(EXIT_FAILURE);
+    }
     tree->scaleLength(sum);
     sum = 1.0/sum;
     for (i = 0; i < tree->size(); i++)
@@ -324,6 +343,8 @@ void PhyloSuperTreePlen::deleteAllPartialLh() {
 		(*it)->_pattern_lh = NULL;
 		(*it)->_pattern_lh_cat = NULL;
 		(*it)->theta_all = NULL;
+        (*it)->buffer_scale_all = NULL;
+        (*it)->buffer_partial_lh = NULL;
 		(*it)->ptn_freq = NULL;
 		(*it)->ptn_freq_computed = false;
 		(*it)->ptn_invar = NULL;
@@ -343,6 +364,8 @@ PhyloSuperTreePlen::~PhyloSuperTreePlen()
 		(*it)->_pattern_lh = NULL;
 		(*it)->_pattern_lh_cat = NULL;
 		(*it)->theta_all = NULL;
+        (*it)->buffer_scale_all = NULL;
+        (*it)->buffer_partial_lh = NULL;
 		(*it)->ptn_freq = NULL;
 		(*it)->ptn_freq_computed = false;
 		(*it)->ptn_invar = NULL;
@@ -453,7 +476,7 @@ void PhyloSuperTreePlen::optimizeOneBranch(PhyloNode *node1, PhyloNode *node2, b
     if (part_order.empty()) computePartitionOrder();
 	// bug fix: assign cur_score into part_info
     #ifdef _OPENMP
-    #pragma omp parallel for private(part) schedule(dynamic) if(size() >= params->num_threads)
+    #pragma omp parallel for private(part) schedule(dynamic) if(num_threads > 1)
     #endif    
 	for (int partid = 0; partid < size(); partid++) {
         part = part_order_by_nptn[partid];
@@ -493,7 +516,7 @@ double PhyloSuperTreePlen::computeFunction(double value) {
 
     if (part_order.empty()) computePartitionOrder();
     #ifdef _OPENMP
-    #pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(ntrees >= params->num_threads)
+    #pragma omp parallel for reduction(+: tree_lh) schedule(dynamic) if(num_threads > 1)
     #endif    
 	for (int partid = 0; partid < ntrees; partid++) {
             int part = part_order_by_nptn[partid];
@@ -551,7 +574,7 @@ void PhyloSuperTreePlen::computeFuncDerv(double value, double &df_ret, double &d
 
     if (part_order.empty()) computePartitionOrder();
     #ifdef _OPENMP
-    #pragma omp parallel for reduction(+: df, ddf) schedule(dynamic) if(ntrees >= params->num_threads)
+    #pragma omp parallel for reduction(+: df, ddf) schedule(dynamic) if(num_threads > 1)
     #endif    
 	for (int partid = 0; partid < ntrees; partid++) {
         int part = part_order_by_nptn[partid];
@@ -567,6 +590,7 @@ void PhyloSuperTreePlen::computeFuncDerv(double value, double &df_ret, double &d
 				if(nei1_part->length<-1e-4){
 					cout<<"lambda = "<<lambda<<endl;
 					cout<<"NEGATIVE BRANCH len = "<<nei1_part->length<<endl<<" rate = "<<part_info[part].part_rate<<endl;
+                    assert(0);
 					outError("shit!!   ",__func__);
 				}
 				at(part)->computeLikelihoodDerv(nei2_part,(PhyloNode*)nei1_part->node, df_aux, ddf_aux);
@@ -628,10 +652,20 @@ NNIMove PhyloSuperTreePlen::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2
     // Initialize node1 and node2 in nniMoves
 	nniMoves[0].node1 = nniMoves[1].node1 = node1;
 	nniMoves[0].node2 = nniMoves[1].node2 = node2;
+    nniMoves[0].newloglh = nniMoves[1].newloglh = -DBL_MAX;
+
+    // check for compatibility with constraint
+    // check for consistency with constraint tree
+    for (cnt = 0; cnt < 2; cnt++) {
+        if (!constraintTree.isCompatible(nniMoves[cnt])) {
+            nniMoves[cnt].node1 = nniMoves[cnt].node2 = NULL;
+        }
+    }
 
 	//--------------------------------------------------------------------------
 
-	this->swapNNIBranch(0.0, node1, node2, &nni_param, nniMoves);
+    if (nniMoves[0].node1 || nniMoves[1].node1)
+        this->swapNNIBranch(0.0, node1, node2, &nni_param, nniMoves);
 
 
 	 // restore curScore
@@ -651,8 +685,8 @@ NNIMove PhyloSuperTreePlen::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2
 	return myMove;
 }
 
-void PhyloSuperTreePlen::doNNIs(int nni2apply, bool changeBran) {
-	IQTree::doNNIs(nni2apply, changeBran);
+void PhyloSuperTreePlen::doNNIs(vector<NNIMove> &compatibleNNIs, bool changeBran) {
+	IQTree::doNNIs(compatibleNNIs, changeBran);
 	mapBranchLen();
 	//clearAllPartialLH();
 }
@@ -954,6 +988,8 @@ double PhyloSuperTreePlen::swapNNIBranch(double cur_score, PhyloNode *node1, Phy
 			//evalNNIs++;
 			//part_info[part].evalNNIs++;
 
+            int mem_id = 0;
+
 			// one branch optimization ------------------------------------------------------------------
 			for(id = 0; id < 2; id++){
 				/*
@@ -969,8 +1005,11 @@ double PhyloSuperTreePlen::swapNNIBranch(double cur_score, PhyloNode *node1, Phy
 
 				// Create a new PhyloNeighbor, with new partial lhs, scale number and set the branch id as before
 				*sub_saved_it[part*6 + id] = new PhyloNeighbor(nei_link, saved_nei[id]->link_neighbors[part]->length);
-				((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->partial_lh = nni_partial_lh + (id*total_block_size + lh_addr);
-				((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->scale_num = nni_scale_num + (id*total_scale_block_size + scale_addr);
+                if (saved_nei[id]->link_neighbors[part]->partial_lh) {
+                    ((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->partial_lh = nni_partial_lh + (mem_id*total_block_size + lh_addr);
+                    ((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->scale_num = nni_scale_num + (mem_id*total_scale_block_size + scale_addr);
+                    mem_id++;
+                }
 				(*sub_saved_it[part*6 + id])->id = saved_nei[id]->link_neighbors[part]->id;
 
 				// update link_neighbor[part]: for New SuperNeighbor we set the corresponding new PhyloNeighbor on partition part
@@ -984,8 +1023,11 @@ double PhyloSuperTreePlen::swapNNIBranch(double cur_score, PhyloNode *node1, Phy
 					node_link = ((SuperNeighbor*)(*node_nei_it[id-2]))->link_neighbors[part]->node;
 					sub_saved_it[part*6 + id] = node_link->findNeighborIt(nei_link);
 					*sub_saved_it[part*6 + id] = new PhyloNeighbor(nei_link, saved_nei[id]->link_neighbors[part]->length);
-					((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->partial_lh = nni_partial_lh + (id*total_block_size + lh_addr);
-					((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->scale_num = nni_scale_num + (id*total_scale_block_size + scale_addr);
+                    if (saved_nei[id]->link_neighbors[part]->partial_lh) {
+                        ((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->partial_lh = nni_partial_lh + (mem_id*total_block_size + lh_addr);
+                        ((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->scale_num = nni_scale_num + (mem_id*total_scale_block_size + scale_addr);
+                        mem_id++;
+                    }
 					(*sub_saved_it[part*6 + id])->id = saved_nei[id]->link_neighbors[part]->id;
 
 					// update link_neighbor[part]
@@ -993,8 +1035,11 @@ double PhyloSuperTreePlen::swapNNIBranch(double cur_score, PhyloNode *node1, Phy
 				}
 			}
 
+            assert(mem_id == 2);
+
 		} else if(is_nni[part]==NNI_ONE_EPSILON){
 
+            int mem_id = 0;
 			// Make sure to update all the necessary link_neighbors and take care of branch lengths
 			// (increase/decrease by central branch where necessary).
 
@@ -1028,8 +1073,11 @@ double PhyloSuperTreePlen::swapNNIBranch(double cur_score, PhyloNode *node1, Phy
 					sub_saved_branch[6*part + id] = nei->link_neighbors[part]->length;
 
 					*sub_saved_it[part*6 + id] = new PhyloNeighbor(nei_link, nei->link_neighbors[part]->length);
-					((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->partial_lh = nni_partial_lh + (id*total_block_size + lh_addr);
-					((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->scale_num = nni_scale_num + (id*total_scale_block_size + scale_addr);
+                    if (nei->link_neighbors[part]->partial_lh) {
+                        ((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->partial_lh = nni_partial_lh + (mem_id*total_block_size + lh_addr);
+                        ((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->scale_num = nni_scale_num + (mem_id*total_scale_block_size + scale_addr);
+                        mem_id++;
+                    }
 					(*sub_saved_it[part*6 + id])->id = nei->link_neighbors[part]->id;
 
 					// If nni5 we update the link neighbors already here, otherwise
@@ -1043,6 +1091,7 @@ double PhyloSuperTreePlen::swapNNIBranch(double cur_score, PhyloNode *node1, Phy
 					id_eps[part] = id;
 				}
 			}
+            assert(mem_id == 1);
 		}else if(is_nni[part]==NNI_THREE_EPSILON && params->nni5){
 			// you fill out link neighbors vector for newly allocated SuperNeighbors
 			for(id = 2; id < 6; id++){
@@ -1093,7 +1142,8 @@ double PhyloSuperTreePlen::swapNNIBranch(double cur_score, PhyloNode *node1, Phy
 	 *	- restore if necessary.
 	 *===========================================================================================*/
 	int cnt;
-	for (cnt = 0; cnt < 2; cnt++) {
+	for (cnt = 0; cnt < 2; cnt++) if (nniMoves[cnt].node1) // only if nniMove satisfy constraint 
+    {
 		//cout<<"NNI Loop-----------------------------NNI."<<cnt<<endl;
 
     	NeighborVec::iterator node1_it = nniMoves[cnt].node1Nei_it;
@@ -1129,6 +1179,10 @@ double PhyloSuperTreePlen::swapNNIBranch(double cur_score, PhyloNode *node1, Phy
 				//cout<<part<<"- NO_EPS: do NNI swap"<<endl;
 				//allNNIcases_computed[0] += 1;
 
+                // reorient partial_lh before swap
+                reorientPartialLh((PhyloNeighbor*)node1_link[part]->findNeighbor(node2_link[part]), node1_link[part]);
+                reorientPartialLh((PhyloNeighbor*)node2_link[part]->findNeighbor(node1_link[part]), node2_link[part]);
+
 				// Do NNI swap on partition
 				node1_link[part]->updateNeighbor(node1_link_it[part], node2_link_nei[part]);
 				node2_link_nei[part]->node->updateNeighbor(node2_link[part], node1_link[part]);
@@ -1420,6 +1474,11 @@ double PhyloSuperTreePlen::swapNNIBranch(double cur_score, PhyloNode *node1, Phy
 		for(part = 0; part < ntrees; part++){
 
 			if(is_nni[part]==NNI_NO_EPSILON){
+
+                // reorient partial_lh before swap
+                reorientPartialLh((PhyloNeighbor*)node1_link[part]->findNeighbor(node2_link[part]), node1_link[part]);
+                reorientPartialLh((PhyloNeighbor*)node2_link[part]->findNeighbor(node1_link[part]), node2_link[part]);
+
 				node1_link[part]->updateNeighbor(node1_link_it[part], node1_link_nei[part]);
 				node1_link_nei[part]->node->updateNeighbor(node2_link[part], node1_link[part]);
 				node2_link[part]->updateNeighbor(node2_link_it[part], node2_link_nei[part]);
@@ -1752,10 +1811,17 @@ void PhyloSuperTreePlen::initializeAllPartialLh() {
 	block_size.resize(ntrees);
 	scale_block_size.resize(ntrees);
 
-	vector<uint64_t> mem_size, lh_cat_size;
+	vector<uint64_t> mem_size, lh_cat_size, buffer_size;
 	mem_size.resize(ntrees);
 	lh_cat_size.resize(ntrees);
-	uint64_t total_mem_size = 0, total_block_size = 0, total_lh_cat_size = 0;
+    buffer_size.resize(ntrees);
+
+	uint64_t
+        total_mem_size = 0,
+        total_block_size = 0,
+        total_scale_block_size = 0,
+        total_lh_cat_size = 0,
+        total_buffer_size = 0;
 
 	if (part_order.empty())
 		computePartitionOrder();
@@ -1763,20 +1829,21 @@ void PhyloSuperTreePlen::initializeAllPartialLh() {
 	for (partid = 0; partid < ntrees; partid++) {
 		part = part_order[partid];
         it = begin() + part;
-		size_t nptn = (*it)->getAlnNPattern() + (*it)->aln->num_states; // extra #numStates for ascertainment bias correction
-		if (instruction_set >= 7)
-			mem_size[part] = ((nptn +3)/4)*4;
-		else
-			mem_size[part] = ((nptn % 2) == 0) ? nptn : (nptn + 1);
-		scale_block_size[part] = nptn;
-		block_size[part] = mem_size[part] * (*it)->aln->num_states * (*it)->getRate()->getNRate() *
+        // extra #numStates for ascertainment bias correction
+		mem_size[part] = get_safe_upper_limit((*it)->getAlnNPattern()) + get_safe_upper_limit((*it)->aln->num_states);
+        size_t mem_cat_size = mem_size[part] * (*it)->getRate()->getNRate() *
 				(((*it)->model_factory->fused_mix_rate)? 1 : (*it)->getModel()->getNMixtures());
 
+		block_size[part] = mem_cat_size * (*it)->aln->num_states;
+		scale_block_size[part] = mem_cat_size;
+
 		lh_cat_size[part] = mem_size[part] * (*it)->getRate()->getNDiscreteRate() *
 				(((*it)->model_factory->fused_mix_rate)? 1 : (*it)->getModel()->getNMixtures());
 		total_mem_size += mem_size[part];
 		total_block_size += block_size[part];
+        total_scale_block_size += scale_block_size[part];
 		total_lh_cat_size += lh_cat_size[part];
+        total_buffer_size += (buffer_size[part] = (*it)->getBufferPartialLhSize());
 	}
 
     if (!_pattern_lh)
@@ -1787,7 +1854,13 @@ void PhyloSuperTreePlen::initializeAllPartialLh() {
     at(part_order[0])->_pattern_lh_cat = _pattern_lh_cat;
     if (!theta_all)
         theta_all = aligned_alloc<double>(total_block_size);
+    if (!buffer_scale_all)
+        buffer_scale_all = aligned_alloc<double>(total_mem_size);
+    if (!buffer_partial_lh)
+        buffer_partial_lh = aligned_alloc<double>(total_buffer_size);
     at(part_order[0])->theta_all = theta_all;
+    at(part_order[0])->buffer_scale_all = buffer_scale_all;
+    at(part_order[0])->buffer_partial_lh = buffer_partial_lh;
     if (!ptn_freq) {
         ptn_freq = aligned_alloc<double>(total_mem_size);
         ptn_freq_computed = false;
@@ -1798,14 +1871,15 @@ void PhyloSuperTreePlen::initializeAllPartialLh() {
         ptn_invar = aligned_alloc<double>(total_mem_size);
     at(part_order[0])->ptn_invar = ptn_invar;
 
-    size_t IT_NUM = (params->nni5) ? 6 : 2;
+//    size_t IT_NUM = (params->nni5) ? 6 : 2;
+    size_t IT_NUM = 2;
     if (!nni_partial_lh) {
         nni_partial_lh = aligned_alloc<double>(IT_NUM*total_block_size);
     }
     at(part_order[0])->nni_partial_lh = nni_partial_lh;
     
     if (!nni_scale_num) {
-        nni_scale_num = aligned_alloc<UBYTE>(IT_NUM*total_mem_size);
+        nni_scale_num = aligned_alloc<UBYTE>(IT_NUM*total_scale_block_size);
     }
     at(part_order[0])->nni_scale_num = nni_scale_num;
 
@@ -1816,11 +1890,13 @@ void PhyloSuperTreePlen::initializeAllPartialLh() {
 		(*it)->_pattern_lh = (*prev_it)->_pattern_lh + mem_size[part];
 		(*it)->_pattern_lh_cat = (*prev_it)->_pattern_lh_cat + lh_cat_size[part];
 		(*it)->theta_all = (*prev_it)->theta_all + block_size[part];
+        (*it)->buffer_scale_all = (*prev_it)->buffer_scale_all + mem_size[part];
+        (*it)->buffer_partial_lh = (*prev_it)->buffer_partial_lh + buffer_size[part];
 		(*it)->ptn_freq = (*prev_it)->ptn_freq + mem_size[part];
 		(*it)->ptn_freq_computed = false;
 		(*it)->ptn_invar = (*prev_it)->ptn_invar + mem_size[part];
         (*it)->nni_partial_lh = (*prev_it)->nni_partial_lh + IT_NUM*block_size[part];
-        (*it)->nni_scale_num = (*prev_it)->nni_scale_num + IT_NUM*mem_size[part];
+        (*it)->nni_scale_num = (*prev_it)->nni_scale_num + IT_NUM*scale_block_size[part];
 	}
 
 	// compute total memory for all partitions
@@ -1873,6 +1949,32 @@ void PhyloSuperTreePlen::initializeAllPartialLh() {
         tip_partial_lh_size = ((tip_partial_lh_size+3)/4)*4;
         lh_addr += tip_partial_lh_size;
     }
+
+    // 2016-09-29: redirect partial_lh when root does not occur in partition tree
+    SuperNeighbor *root_nei = (SuperNeighbor*)root->neighbors[0];
+    for (it = begin(), part = 0; it != end(); it++, part++) {
+        if (root_nei->link_neighbors[part])
+            continue;
+        NodeVector nodes;
+        (*it)->getInternalNodes(nodes);
+        for (NodeVector::iterator nit = nodes.begin(); nit != nodes.end(); nit++) {
+            bool has_partial_lh = false;
+            FOR_NEIGHBOR_IT(*nit, NULL, neiit)
+                if ( ((PhyloNeighbor*)(*neiit)->node->findNeighbor(*nit))->partial_lh) {
+                    has_partial_lh = true;
+                    break;
+                }
+            if (has_partial_lh)
+                continue;
+            // add partial_lh
+            PhyloNeighbor *back_nei = (PhyloNeighbor*)(*nit)->neighbors[0]->node->findNeighbor(*nit);
+            back_nei->partial_lh = lh_addr;
+            back_nei->scale_num = scale_addr;
+            lh_addr = lh_addr + block_size[part];
+            scale_addr = scale_addr + scale_block_size[part];
+        }
+    }
+
 }
 
 void PhyloSuperTreePlen::initializeAllPartialLh(double* &lh_addr, UBYTE* &scale_addr, UINT* &pars_addr, PhyloNode *node, PhyloNode *dad) {
@@ -1889,7 +1991,7 @@ void PhyloSuperTreePlen::initializeAllPartialLh(double* &lh_addr, UBYTE* &scale_
         	PhyloNeighbor *nei_part_back = nei_back->link_neighbors[part];
             
 
-            if (params->lh_mem_save == LM_PER_NODE && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
+            if (params->lh_mem_save == LM_PER_NODE) {
                 if (!nei_part_back->node->isLeaf()) {
                     if (!nei_part_back->partial_lh) {
                         nei_part_back->partial_lh = lh_addr;
@@ -1904,7 +2006,7 @@ void PhyloSuperTreePlen::initializeAllPartialLh(double* &lh_addr, UBYTE* &scale_
 //                nei_part->partial_lh = NULL;
 //                nei_part->scale_num = NULL;
             } else {
-                if (nei_part->node->isLeaf() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
+                if (nei_part->node->isLeaf()) {
                     nei_part->partial_lh = NULL; // do not allocate memory for tip, use tip_partial_lh instead
                     nei_part->scale_num = NULL;
                 } else if (!nei_part->partial_lh) {
@@ -1917,7 +2019,7 @@ void PhyloSuperTreePlen::initializeAllPartialLh(double* &lh_addr, UBYTE* &scale_
     //			pars_addr += partial_pars_entries[part];
 
                 nei_part = nei_back->link_neighbors[part];
-                if (nei_part->node->isLeaf() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
+                if (nei_part->node->isLeaf()) {
                     nei_part->partial_lh = NULL; // do not allocate memory for tip, use tip_partial_lh instead
                     nei_part->scale_num = NULL;
                 } else if (!nei_part->partial_lh) {
diff --git a/phylosupertreeplen.h b/phylosupertreeplen.h
index 9c844d3..88dd177 100644
--- a/phylosupertreeplen.h
+++ b/phylosupertreeplen.h
@@ -313,12 +313,13 @@ public:
             @param move the single NNI
      */
     virtual void doNNI(NNIMove &move, bool clearLH = true);
-    /**
-            apply nni2apply NNIs from the non-conflicting NNI list
-            @param nni2apply number of NNIs to apply from the list
+
+	/**
+            apply  NNIs from the non-conflicting NNI list
+            @param compatibleNNIs vector of all compatible NNIs
             @param changeBran whether or not the computed branch lengths should be applied
      */
-    virtual void doNNIs(int nni2apply, bool changeBran = true);
+    virtual void doNNIs(vector<NNIMove> &compatibleNNIs, bool changeBran = true);
 
     /**
      *   Apply 5 new branch lengths stored in the NNI move
diff --git a/phylotesting.cpp b/phylotesting.cpp
index 0a318cd..2ddc637 100644
--- a/phylotesting.cpp
+++ b/phylotesting.cpp
@@ -35,7 +35,7 @@
 
 #include "phyloanalysis.h"
 #include "gsl/mygsl.h"
-#include "vectorclass/vectorclass.h"
+//#include "vectorclass/vectorclass.h"
 
 
 /******* Binary model set ******/
@@ -241,6 +241,59 @@ void printSiteLh(const char*filename, PhyloTree *tree, double *ptn_lh,
 		delete[] pattern_lh;
 }
 
+void printPartitionLh(const char*filename, PhyloTree *tree, double *ptn_lh,
+		bool append, const char *linename) {
+
+    assert(tree->isSuperTree());
+    PhyloSuperTree *stree = (PhyloSuperTree*)tree;
+	int i;
+	double *pattern_lh;
+	if (!ptn_lh) {
+		pattern_lh = new double[tree->getAlnNPattern()];
+		tree->computePatternLikelihood(pattern_lh);
+	} else
+		pattern_lh = ptn_lh;
+
+    double partition_lh[stree->size()];
+    int part;
+    double *pattern_lh_ptr = pattern_lh;
+    for (part = 0; part < stree->size(); part++) {
+        size_t nptn = stree->at(part)->getAlnNPattern();
+        partition_lh[part] = 0.0;
+        for (i = 0; i < nptn; i++)
+            partition_lh[part] += pattern_lh_ptr[i] * stree->at(part)->ptn_freq[i];
+        pattern_lh_ptr += nptn;
+    }
+
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		if (append) {
+			out.open(filename, ios::out | ios::app);
+		} else {
+			out.open(filename);
+			out << 1 << " " << stree->size() << endl;
+		}
+		if (!linename)
+			out << "Part_Lh   ";
+		else {
+			out.width(10);
+			out << left << linename;
+		}
+		for (i = 0; i < stree->size(); i++)
+			out << " " << partition_lh[i];
+		out << endl;
+		out.close();
+		if (!append)
+			cout << "Partition log-likelihoods printed to " << filename << endl;
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+
+	if (!ptn_lh)
+		delete[] pattern_lh;
+}
+
 void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) {
 
     if (tree->isSuperTree()) {
@@ -267,7 +320,7 @@ void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl)
 	double *pattern_lh, *pattern_lh_cat;
 	int i;
 	pattern_lh = new double[tree->getAlnNPattern()];
-	pattern_lh_cat = new double[tree->getAlnNPattern()*ncat];
+	pattern_lh_cat = new double[((size_t)tree->getAlnNPattern())*ncat];
 	tree->computePatternLikelihood(pattern_lh, NULL, pattern_lh_cat, wsl);
 
     
@@ -339,6 +392,120 @@ void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl)
 
 }
 
+/**
+ * Write ancestral state probabilities and sequences of all internal nodes to
+ * <out_prefix>.ancestralprob and <out_prefix>.ancestralseq.
+ * @param out_prefix prefix of the two output file names
+ * @param tree phylogenetic tree
+ * @param ast AST_JOINT to additionally print the joint reconstruction
+ */
+void printAncestralSequences(const char *out_prefix, PhyloTree *tree, AncestralSeqType ast) {
+    int i, j, nsites = tree->getAlnNSite(), nstates = tree->aln->num_states, nptn = tree->getAlnNPattern();
+    // honour the ast argument instead of re-reading tree->params
+    const bool print_joint = (ast == AST_JOINT);
+    int *joint_ancestral = NULL;
+    if (print_joint) {
+        // widen to size_t before multiplying to avoid int overflow
+        joint_ancestral = new int[((size_t)nptn)*tree->leafNum];
+        tree->computeJointAncestralSequences(joint_ancestral);
+    }
+    string filename = (string)out_prefix + ".ancestralprob";
+    string filenameseq = (string)out_prefix + ".ancestralseq";
+    // allocated outside the try block so they are released after the catch as well
+    double *marginal_ancestral_prob = new double[((size_t)nptn) * tree->getModel()->num_states];
+    int *marginal_ancestral_seq = new int[nptn];
+
+    try {
+        ofstream out;
+        out.exceptions(ios::failbit | ios::badbit);
+        out.open(filename.c_str());
+        ofstream outseq;
+        outseq.exceptions(ios::failbit | ios::badbit);
+        outseq.open(filenameseq.c_str());
+        NodeVector nodes;
+        tree->getInternalNodes(nodes);
+        IntVector pattern_index;
+        tree->aln->getSitePatternIndex(pattern_index);
+
+        // NOTE(review): the header has no column for the joint state that is
+        // printed before the marginal state when ast == AST_JOINT
+        out << "Node\tSite\tMargin";
+        for (i = 0; i < nstates; i++)
+            out << "\tp_" << tree->aln->convertStateBackStr(i);
+        out << endl;
+        // joint mode writes two sequences (marginal and joint) per node
+        if (print_joint)
+            outseq << 2*(tree->nodeNum-tree->leafNum) << " " << nsites << endl;
+        else
+            outseq << (tree->nodeNum-tree->leafNum) << " " << nsites << endl;
+        int name_width = max(tree->aln->getMaxSeqNameLength(),6)+10;
+
+        for (NodeVector::iterator it = nodes.begin(); it != nodes.end(); it++) {
+            PhyloNode *node = (PhyloNode*)(*it);
+            PhyloNode *dad = (PhyloNode*)node->neighbors[0]->node;
+            tree->computeMarginalAncestralProbability((PhyloNeighbor*)dad->findNeighbor(node), dad, marginal_ancestral_prob);
+            // offset into the joint buffer only when it exists (no arithmetic on NULL)
+            int *joint_ancestral_node = NULL;
+            if (print_joint)
+                joint_ancestral_node = joint_ancestral + ((size_t)(node->id - tree->leafNum))*nptn;
+
+            // compute state with highest probability
+            for (i = 0; i < nptn; i++) {
+                double *prob = marginal_ancestral_prob + ((size_t)i*nstates);
+                int state_best = 0;
+                for (j = 1; j < nstates; j++)
+                    if (prob[j] > prob[state_best])
+                        state_best = j;
+                // a best state below min_ancestral_prob is replaced by STATE_INVALID
+                if (prob[state_best] < tree->params->min_ancestral_prob)
+                    state_best = STATE_INVALID;
+                marginal_ancestral_seq[i] = state_best;
+            }
+            // set node name if necessary
+            if (node->name.empty() || !isalpha((unsigned char)node->name[0])) {
+                node->name = "Node" + convertIntToString(node->id-tree->leafNum+1);
+            }
+
+            // print ancestral state probabilities
+            for (i = 0; i < nsites; i++) {
+                int ptn = pattern_index[i];
+                out << node->name << "\t" << i+1 << "\t";
+                if (print_joint)
+                    out << tree->aln->convertStateBackStr(joint_ancestral_node[ptn]) << "\t";
+                out << tree->aln->convertStateBackStr(marginal_ancestral_seq[ptn]);
+                for (j = 0; j < nstates; j++) {
+                    out << "\t" << marginal_ancestral_prob[(size_t)ptn*nstates+j];
+                }
+                out << endl;
+            }
+
+            // print ancestral sequences
+            outseq.width(name_width);
+            outseq << left << (node->name+"_marginal") << " ";
+            for (i = 0; i < nsites; i++)
+                outseq << tree->aln->convertStateBackStr(marginal_ancestral_seq[pattern_index[i]]);
+            outseq << endl;
+            if (print_joint) {
+                outseq.width(name_width);
+                outseq << left << (node->name+"_joint") << " ";
+                for (i = 0; i < nsites; i++)
+                    outseq << tree->aln->convertStateBackStr(joint_ancestral_node[pattern_index[i]]);
+                outseq << endl;
+            }
+        }
+
+        out.close();
+        outseq.close();
+        cout << "Ancestral state probabilities printed to " << filename << endl;
+        cout << "Ancestral sequences printed to " << filenameseq << endl;
+    } catch (const ios::failure &) {
+        outError(ERR_WRITE_OUTPUT, filename);
+    }
+    delete[] marginal_ancestral_seq;
+    delete[] marginal_ancestral_prob;
+    delete[] joint_ancestral;
+}
+
 void printSiteProbCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) {
 
     if (wsl == WSL_NONE || wsl == WSL_SITE)
@@ -357,7 +524,7 @@ void printSiteProbCategory(const char*filename, PhyloTree *tree, SiteLoglType ws
         }
     }
 	size_t cat, ncat = tree->getNumLhCat(wsl);
-    double *ptn_prob_cat = new double[tree->getAlnNPattern()*ncat];
+    double *ptn_prob_cat = new double[((size_t)tree->getAlnNPattern())*ncat];
 	tree->computePatternProbabilityCategory(ptn_prob_cat, wsl);
     
 	try {
@@ -415,7 +582,7 @@ void printSiteStateFreq(const char*filename, PhyloTree *tree, double *state_freq
     if (state_freqs) {
     	ptn_state_freq = state_freqs;
     } else {
-    	ptn_state_freq = new double[tree->getAlnNPattern() * nstates];
+    	ptn_state_freq = new double[((size_t)tree->getAlnNPattern()) * nstates];
         tree->computePatternStateFreq(ptn_state_freq);
     }
 
@@ -650,8 +817,8 @@ int getModelList(Params &params, Alignment *aln, StrVector &models, bool separat
         }
     }
 
-    bool with_new = params.model_name.find("NEW") != string::npos;
-    bool with_asc = params.model_name.find("ASC") != string::npos;
+	bool with_new = params.model_name.find("NEW") != string::npos;
+	bool with_asc = params.model_name.find("ASC") != string::npos;
 
 //	if (seq_type == SEQ_CODON) {
 //		for (i = 0; i < noptions; i++)
@@ -849,6 +1016,7 @@ void mergePartitions(PhyloSuperTree* super_tree, vector<IntVector> &gene_sets, S
 		part_info.push_back(info);
 		Alignment *aln = super_aln->concatenateAlignments(*it);
 		PhyloTree *tree = super_tree->extractSubtree(*it);
+        tree->setParams(super_tree->params);
 		tree->setAlignment(aln);
 		tree_vec.push_back(tree);
 	}
@@ -861,6 +1029,7 @@ void mergePartitions(PhyloSuperTree* super_tree, vector<IntVector> &gene_sets, S
 
 	delete super_tree->aln;
 	super_tree->aln = new SuperAlignment(super_tree);
+    super_tree->setAlignment(super_tree->aln);
 }
 
 void printModelFile(ostream &fmodel, Params &params, PhyloTree *tree, ModelInfo &info, string &set_name) {
@@ -916,7 +1085,7 @@ void printModelFile(ostream &fmodel, Params &params, PhyloTree *tree, ModelInfo
  * @param model_info (IN/OUT) all model information
  * @return total number of parameters
  */
-void testPartitionModel(Params &params, PhyloSuperTree* in_tree, vector<ModelInfo> &model_info, ostream &fmodel, ModelsBlock *models_block ) {
+void testPartitionModel(Params &params, PhyloSuperTree* in_tree, vector<ModelInfo> &model_info, ostream &fmodel, ModelsBlock *models_block, int num_threads) {
 //    params.print_partition_info = true;
 //    params.print_conaln = true;
 	int i = 0;
@@ -927,15 +1096,25 @@ void testPartitionModel(Params &params, PhyloSuperTree* in_tree, vector<ModelInf
 	double lhsum = 0.0;
 	int dfsum = 0;
 	int ssize = in_tree->getAlnNSite();
-	int num_model = 0;
-    int total_num_model = in_tree->size();
+	int64_t num_model = 0;
+    int64_t total_num_model = in_tree->size();
 	if (params.model_name.find("LINK") != string::npos || params.model_name.find("MERGE") != string::npos) {
         double p = params.partfinder_rcluster/100.0;
         total_num_model += round(in_tree->size()*(in_tree->size()-1)*p/2);
         for (i = in_tree->size()-2; i > 0; i--)
             total_num_model += max(round(i*p), 1.0);
     }
-    
+
+
+#ifdef _OPENMP
+    if (num_threads <= 0) {
+        // partition selection scales well with many cores
+        num_threads = min((int64_t)countPhysicalCPUCores(), total_num_model);
+        omp_set_num_threads(num_threads);
+        cout << "NUMBER OF THREADS FOR PARTITION FINDING: " << num_threads << endl;
+    }
+#endif
+
     double start_time = getRealTime();
     
 	cout << "Selecting individual models for " << in_tree->size() << " charsets using " << criterionName(params.model_test_criterion) << "..." << endl;
@@ -957,7 +1136,7 @@ void testPartitionModel(Params &params, PhyloSuperTree* in_tree, vector<ModelInf
         dist[i] = -((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states;
     }
     
-    if (params.num_threads > 1) 
+    if (num_threads > 1)
     {
         quicksort(dist, 0, in_tree->size()-1, distID);
         if (verbose_mode >= VB_MED) {
@@ -981,7 +1160,7 @@ void testPartitionModel(Params &params, PhyloSuperTree* in_tree, vector<ModelInf
         stringstream this_fmodel;
 		// do the computation
 //#ifdef _OPENMP
-		string model = testModel(params, this_tree, part_model_info, this_fmodel, models_block, in_tree->part_info[i].name);
+		string model = testModel(params, this_tree, part_model_info, this_fmodel, models_block, 1, in_tree->part_info[i].name);
 //#else
 //		string model = testModel(params, this_tree, part_model_info, fmodel, in_tree->part_info[i].name);
 //#endif
@@ -1078,7 +1257,7 @@ void testPartitionModel(Params &params, PhyloSuperTree* in_tree, vector<ModelInf
             this_aln = in_tree->at(distID[i] & ((1<<16)-1))->aln;
             dist[i] -= ((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states;
         }
-        if (params.num_threads > 1 && num_pairs >= 1)
+        if (num_threads > 1 && num_pairs >= 1)
             quicksort(dist, 0, num_pairs-1, distID);
 
 #ifdef _OPENMP
@@ -1127,7 +1306,7 @@ void testPartitionModel(Params &params, PhyloSuperTree* in_tree, vector<ModelInf
                     tree->setCheckpoint(new Checkpoint());
                 }
 //#ifdef _OPENMP
-                model = testModel(params, tree, part_model_info, this_fmodel, models_block, set_name);
+                model = testModel(params, tree, part_model_info, this_fmodel, models_block, 1, set_name);
 //#else
 //                model = testModel(params, tree, part_model_info, fmodel, set_name);
 //#endif
@@ -1160,7 +1339,7 @@ void testPartitionModel(Params &params, PhyloSuperTree* in_tree, vector<ModelInf
 					cout.width(11);
 					cout << score << " " << set_name;
                     if (num_model >= 10) {
-                        double remain_time = max(total_num_model-num_model, 0)*(getRealTime()-start_time)/num_model;
+                        double remain_time = max(total_num_model-num_model, (int64_t)0)*(getRealTime()-start_time)/num_model;
                         cout << "\t" << convert_time(getRealTime()-start_time) << " (" 
                             << convert_time(remain_time) << " left)";
                     }
@@ -1233,7 +1412,8 @@ void testPartitionModel(Params &params, PhyloSuperTree* in_tree, vector<ModelInf
     
     delete [] distID;
     delete [] dist;
-	mergePartitions(in_tree, gene_sets, model_names);
+    if (gene_sets.size() < in_tree->size())
+        mergePartitions(in_tree, gene_sets, model_names);
 	in_tree->printBestPartition((string(params.out_prefix) + ".best_scheme.nex").c_str());
 	in_tree->printBestPartitionRaxml((string(params.out_prefix) + ".best_scheme").c_str());
 }
@@ -1253,7 +1433,7 @@ bool isMixtureModel(ModelsBlock *models_block, string &model_str) {
 }
 
 string testModel(Params &params, PhyloTree* in_tree, vector<ModelInfo> &model_info, ostream &fmodel, ModelsBlock *models_block,
-    string set_name, bool print_mem_usage) 
+    int num_threads, string set_name, bool print_mem_usage)
 {
 	SeqType seq_type = in_tree->aln->seq_type;
 	if (in_tree->isSuperTree())
@@ -1283,14 +1463,13 @@ string testModel(Params &params, PhyloTree* in_tree, vector<ModelInfo> &model_in
 #endif
     }
 
-
 	string best_model = "";
 	/* first check the model file */
 
 	if (in_tree->isSuperTree()) {
 		// select model for each partition
 		PhyloSuperTree *stree = (PhyloSuperTree*)in_tree;
-		testPartitionModel(params, stree, model_info, fmodel, models_block);
+		testPartitionModel(params, stree, model_info, fmodel, models_block, num_threads);
 //        stree->linkTrees();
         stree->mapTrees();
 		string res_models = "";
@@ -1302,26 +1481,25 @@ string testModel(Params &params, PhyloTree* in_tree, vector<ModelInfo> &model_in
 	}
 
 	in_tree->optimize_by_newton = params.optimize_by_newton;
-	in_tree->setLikelihoodKernel(params.SSE);
+	in_tree->setLikelihoodKernel(params.SSE, num_threads);
 
 //    int num_rate_classes = 3 + params.max_rate_cats;
 
 	RateHeterogeneity ** rate_class = new RateHeterogeneity*[4];
 	rate_class[0] = new RateHeterogeneity();
-	rate_class[1] = new RateInvar(-1, NULL);
-	rate_class[2] = new RateGamma(params.num_rate_cats, params.gamma_shape, params.gamma_median, NULL);
-	rate_class[3] = new RateGammaInvar(params.num_rate_cats, params.gamma_shape, params.gamma_median, -1, params.optimize_alg_gammai, NULL, false);
+	rate_class[1] = new RateInvar(params.p_invar_sites, in_tree);
+	rate_class[2] = new RateGamma(params.num_rate_cats, params.gamma_shape, params.gamma_median, in_tree);
+	rate_class[3] = new RateGammaInvar(params.num_rate_cats, params.gamma_shape, params.gamma_median, -1, params.optimize_alg_gammai, in_tree, false);
     
     RateFree ** rate_class_free = new RateFree*[params.max_rate_cats-1];
     
     for (model = 0; model < params.max_rate_cats-1; model++)
-        rate_class_free[model] = new RateFree(model+2, params.gamma_shape, "", false, params.optimize_alg, NULL);
+        rate_class_free[model] = new RateFree(model+2, params.gamma_shape, "", false, params.optimize_alg, in_tree);
 
     RateFreeInvar ** rate_class_freeinvar = new RateFreeInvar*[params.max_rate_cats-1];
     
     for (model = 0; model < params.max_rate_cats-1; model++) {
-        rate_class_freeinvar[model] = new RateFreeInvar(model+2, params.gamma_shape, "", false, in_tree->aln->frac_const_sites/2.0, params.optimize_alg, NULL);
-        rate_class_freeinvar[model]->setFixPInvar(false);
+        rate_class_freeinvar[model] = new RateFreeInvar(model+2, params.gamma_shape, "", false, params.p_invar_sites, params.optimize_alg, in_tree);
     }
         
         
@@ -1415,7 +1593,7 @@ string testModel(Params &params, PhyloTree* in_tree, vector<ModelInfo> &model_in
             }
         } else {
             // kernel might be changed if mixture model was tested
-            in_tree->setLikelihoodKernel(params.SSE);
+            in_tree->setLikelihoodKernel(params.SSE, num_threads);
             // normal model
             if (model_names[model].find("+ASC") != string::npos) {
                 model_fac->unobserved_ptns = in_tree->aln->getUnobservedConstPatterns();
@@ -1506,6 +1684,13 @@ string testModel(Params &params, PhyloTree* in_tree, vector<ModelInfo> &model_in
         
         tree->clearAllPartialLH();
 
+#ifdef _OPENMP
+    if (num_threads <= 0) {
+        num_threads = tree->testNumThreads();
+        omp_set_num_threads(num_threads);
+    }
+#endif
+
 
 		// optimize model parameters
 		ModelInfo info;        
@@ -1560,6 +1745,13 @@ string testModel(Params &params, PhyloTree* in_tree, vector<ModelInfo> &model_in
                 // set checkpoint
                 iqtree->setCheckpoint(in_tree->getCheckpoint());
                 iqtree->num_precision = in_tree->num_precision;
+
+                // clear all checkpointed information
+                Checkpoint *newCheckpoint = new Checkpoint;
+                iqtree->getCheckpoint()->getSubCheckpoint(newCheckpoint, "iqtree");
+                iqtree->getCheckpoint()->clear();
+                iqtree->getCheckpoint()->insert(newCheckpoint->begin(), newCheckpoint->end());
+                delete newCheckpoint;
                 
                 cout << endl << "===> Testing model " << model+1 << ": " << params.model_name << endl;
                 runTreeReconstruction(params, original_model, *iqtree, model_info);
@@ -1571,7 +1763,7 @@ string testModel(Params &params, PhyloTree* in_tree, vector<ModelInfo> &model_in
                 tree = iqtree;
 
                 // clear all checkpointed information
-                Checkpoint *newCheckpoint = new Checkpoint;
+                newCheckpoint = new Checkpoint;
                 tree->getCheckpoint()->getSubCheckpoint(newCheckpoint, "iqtree");
                 tree->getCheckpoint()->clear();
                 tree->getCheckpoint()->insert(newCheckpoint->begin(), newCheckpoint->end());
@@ -1594,6 +1786,8 @@ string testModel(Params &params, PhyloTree* in_tree, vector<ModelInfo> &model_in
                     tree->fixNegativeBranch(true);
                     tree->clearAllPartialLH();
                 }
+                if (verbose_mode >= VB_MED)
+                    cout << "Optimizing model " << info.name << endl;
                 info.logl = tree->getModelFactory()->optimizeParameters(false, false, TOL_LIKELIHOOD_MODELTEST, TOL_GRADIENT_MODELTEST);
                 info.tree_len = tree->treeLength();
                 if (prev_model_id >= 0) {
@@ -2259,14 +2453,7 @@ void performAUTest(Params &params, PhyloTree *tree, double *pattern_lhs, vector<
                     for (ptn = 0; ptn < nptn; ptn++)
                         tree_lh += pattern_lh[ptn] * boot_sample_dbl[ptn];
                 } else {
-#ifdef BINARY32
-                    tree_lh = tree->dotProductSIMD<double, Vec2d, 2>(pattern_lh, boot_sample_dbl, nptn);
-#else
-                    if (instruction_set >= 7)
-                        tree_lh = tree->dotProductSIMD<double, Vec4d, 4>(pattern_lh, boot_sample_dbl, nptn);
-                    else
-                        tree_lh = tree->dotProductSIMD<double, Vec2d, 2>(pattern_lh, boot_sample_dbl, nptn);
-#endif
+                    tree_lh = tree->dotProductDoubleCall(pattern_lh, boot_sample_dbl, nptn);
                 }
                 // rescale lh
                 tree_lh /= r[k];
@@ -2468,7 +2655,7 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
 	cout << endl;
 	//MTreeSet trees(params.treeset_file, params.is_rooted, params.tree_burnin, params.tree_max_count);
 	cout << "Reading trees in " << params.treeset_file << " ..." << endl;
-	int ntrees = countDistinctTrees(params.treeset_file, params.is_rooted, tree, distinct_ids, params.distinct_trees);
+	size_t ntrees = countDistinctTrees(params.treeset_file, params.is_rooted, tree, distinct_ids, params.distinct_trees);
 	if (ntrees < distinct_ids.size()) {
 		cout << "WARNING: " << distinct_ids.size() << " trees detected but only " << ntrees << " distinct trees will be evaluated" << endl;
 	} else {
@@ -2498,10 +2685,22 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
 		site_lh_out.close();
 	}
 
+    if (params.print_partition_lh && !tree->isSuperTree()) {
+        outWarning("-wpl does not work with non-partition model");
+        params.print_partition_lh = false;
+    }
+	string part_lh_file = params.out_prefix;
+	part_lh_file += ".partlh";
+	if (params.print_partition_lh) {
+		ofstream part_lh_out(part_lh_file.c_str());
+		part_lh_out << ntrees << " " << ((PhyloSuperTree*)tree)->size() << endl;
+		part_lh_out.close();
+	}
+
 	double time_start = getRealTime();
 
 	int *boot_samples = NULL;
-	int boot;
+	size_t boot;
 	//double *saved_tree_lhs = NULL;
 	double *tree_lhs = NULL; // RELL score matrix of size #trees x #replicates
 	double *pattern_lh = NULL;
@@ -2509,8 +2708,8 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
 	double *orig_tree_lh = NULL; // Original tree log-likelihoods
 	double *max_lh = NULL;
 	double *lhdiff_weights = NULL;
-	int nptn = tree->getAlnNPattern();
-    int maxnptn = get_safe_upper_limit(nptn);
+	size_t nptn = tree->getAlnNPattern();
+    size_t maxnptn = get_safe_upper_limit(nptn);
     
 	if (params.topotest_replicates && ntrees > 1) {
 		size_t mem_size = (size_t)params.topotest_replicates*nptn*sizeof(int) +
@@ -2524,8 +2723,22 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
 		cout << "Creating " << params.topotest_replicates << " bootstrap replicates..." << endl;
 		if (!(boot_samples = new int [params.topotest_replicates*nptn]))
 			outError(ERR_NO_MEMORY);
+#ifdef _OPENMP
+        #pragma omp parallel private(boot) if(nptn > 10000)
+        {
+        int *rstream;
+        init_random(params.ran_seed + omp_get_thread_num(), false, &rstream);
+        #pragma omp for schedule(static)
+#else
+        int *rstream = randstream;
+#endif
 		for (boot = 0; boot < params.topotest_replicates; boot++)
-			tree->aln->createBootstrapAlignment(boot_samples + (boot*nptn), params.bootstrap_spec);
+			tree->aln->createBootstrapAlignment(boot_samples + (boot*nptn), params.bootstrap_spec, rstream);
+#ifdef _OPENMP
+        finish_random(rstream);
+        }
+#endif
+        cout << "done" << endl;
 		//if (!(saved_tree_lhs = new double [ntrees * params.topotest_replicates]))
 		//	outError(ERR_NO_MEMORY);
 		if (!(tree_lhs = new double [ntrees * params.topotest_replicates]))
@@ -2566,13 +2779,6 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
         tree->setRootNode(params.root);
 		if (tree->isSuperTree())
 			((PhyloSuperTree*) tree)->mapTrees();
-//		if ((tree->sse == LK_EIGEN || tree->sse == LK_EIGEN_SSE) && !tree->isBifurcating()) {
-//			cout << "NOTE: Changing to old kernel as user tree is multifurcating" << endl;
-//			if (tree->sse == LK_EIGEN)
-//				tree->changeLikelihoodKernel(LK_NORMAL);
-//			else
-//				tree->changeLikelihoodKernel(LK_SSE);
-//		}
 
 		tree->initializeAllPartialLh();
 		tree->fixNegativeBranch(false);
@@ -2600,6 +2806,10 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
 			string tree_name = "Tree" + convertIntToString(tree_index+1);
 			printSiteLh(site_lh_file.c_str(), tree, pattern_lh, true, tree_name.c_str());
 		}
+		if (params.print_partition_lh) {
+			string tree_name = "Tree" + convertIntToString(tree_index+1);
+			printPartitionLh(part_lh_file.c_str(), tree, pattern_lh, true, tree_name.c_str());
+		}
 		info[tid].logl = tree->getCurScore();
 
 		if (!params.topotest_replicates || ntrees <= 1) {
@@ -2612,7 +2822,7 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
 		for (boot = 0; boot < params.topotest_replicates; boot++) {
 			double lh = 0.0;
 			int *this_boot_sample = boot_samples + (boot*nptn);
-			for (int ptn = 0; ptn < nptn; ptn++)
+			for (size_t ptn = 0; ptn < nptn; ptn++)
 				lh += pattern_lh[ptn] * this_boot_sample[ptn];
 			tree_lhs_offset[boot] = lh;
 		}
@@ -2693,9 +2903,9 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
 		}
 
 		double orig_max_lh = orig_tree_lh[0];
-		int orig_max_id = 0;
+		size_t orig_max_id = 0;
 		double orig_2ndmax_lh = -DBL_MAX;
-		int orig_2ndmax_id = -1;
+		size_t orig_2ndmax_id = -1;
 		// find the max tree ID
 		for (tid = 1; tid < ntrees; tid++)
 			if (orig_max_lh < orig_tree_lh[tid]) {
@@ -2716,7 +2926,7 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
 			// SH compute original deviation from max_lh
 			info[tid].kh_pvalue = 0.0;
 			info[tid].sh_pvalue = 0.0;
-			int max_id = (tid != orig_max_id) ? orig_max_id : orig_2ndmax_id;
+			size_t max_id = (tid != orig_max_id) ? orig_max_id : orig_2ndmax_id;
 			double orig_diff = orig_tree_lh[max_id] - orig_tree_lh[tid] - avg_lh[tid];
 			double *max_kh = tree_lhs + (max_id * params.topotest_replicates);
 			for (boot = 0; boot < params.topotest_replicates; boot++) {
@@ -2752,7 +2962,7 @@ void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVect
 				info[tid].wkh_pvalue = 0.0;
 				info[tid].wsh_pvalue = 0.0;
 				double worig_diff = -DBL_MAX;
-				int max_id = -1;
+				size_t max_id = -1;
 				for (tid2 = 0; tid2 < ntrees; tid2++)
 					if (tid2 != tid) {
 						double wdiff = (orig_tree_lh[tid2] - orig_tree_lh[tid])*lhdiff_weights[tid*ntrees+tid2];
diff --git a/phylotesting.h b/phylotesting.h
index 657a06d..63f9889 100644
--- a/phylotesting.h
+++ b/phylotesting.h
@@ -69,7 +69,7 @@ bool checkModelFile(string model_file, bool is_partitioned, vector<ModelInfo> &i
  @return name of best-fit-model
  */
 string testModel(Params &params, PhyloTree* in_tree, vector<ModelInfo> &model_info, ostream &fmodel,
-		ModelsBlock *models_block, string set_name = "", bool print_mem_usage = false);
+		ModelsBlock *models_block, int num_threads, string set_name = "", bool print_mem_usage = false);
 
 /**
  * print site log likelihoods to a fileExists
@@ -83,6 +83,17 @@ void printSiteLh(const char*filename, PhyloTree *tree, double *ptn_lh = NULL,
 		bool append = false, const char *linename = NULL);
 
 /**
+ * print partition log likelihoods to a file
+ * @param filename output file name
+ * @param tree phylogenetic tree
+ * @param ptn_lh pattern log-likelihoods, will be computed if NULL
+ * @param append TRUE to append to existing file, FALSE otherwise
+ * @param linename name of the line, default "Part_Lh" if NULL
+ */
+void printPartitionLh(const char*filename, PhyloTree *tree, double *ptn_lh = NULL,
+		bool append = false, const char *linename = NULL);
+
+/**
  * print site log likelihoods per category to a file
  * @param filename output file name
  * @param tree phylogenetic tree
@@ -104,6 +115,14 @@ void printSiteProbCategory(const char*filename, PhyloTree *tree, SiteLoglType ws
 void printSiteStateFreq(const char*filename, PhyloTree *tree, double *state_freqs = NULL);
 
 /**
+    print ancestral sequences
+    @param filename prefix of the output file names; ".ancestralprob" and ".ancestralseq" are appended
+    @param tree phylogenetic tree
+    @param ast either AST_MARGINAL or AST_JOINT
+*/
+void printAncestralSequences(const char*filename, PhyloTree *tree, AncestralSeqType ast);
+
+/**
  * Evaluate user-trees with possibility of tree topology tests
  * @param params program parameters
  * @param tree current tree
diff --git a/phylotree.cpp b/phylotree.cpp
index b9143ce..5cd4c55 100644
--- a/phylotree.cpp
+++ b/phylotree.cpp
@@ -1,15 +1,24 @@
-//
-// C++ Implementation: phylotree
-//
-// Description:
-//
-//
-// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
-//
-// Copyright: See COPYING file that comes with this distribution
-//
-//
-
+/***************************************************************************
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
 #include "phylotree.h"
 #include "bionj.h"
 //#include "rateheterogeneity.h"
@@ -21,8 +30,11 @@
 #include "phylosupertree.h"
 #include "phylosupertreeplen.h"
 #include "upperbounds.h"
+#include "MPIHelper.h"
 #include "model/modelmixture.h"
 
+const int LH_MIN_CONST = 1;
+
 //const static int BINARY_SCALE = floor(log2(1/SCALING_THRESHOLD));
 //const static double LOG_BINARY_SCALE = -(log(2) * BINARY_SCALE);
 
@@ -70,26 +82,24 @@ void PhyloTree::init() {
     nni_scale_num = NULL;
     central_partial_pars = NULL;
     model_factory = NULL;
-//    tmp_partial_lh1 = NULL;
-//    tmp_partial_lh2 = NULL;
-//    tmp_anscentral_state_prob1 = NULL;
-//    tmp_anscentral_state_prob2 = NULL;
-    //tmp_ptn_rates = NULL;
-    //state_freqs = NULL;
-//    tmp_scale_num1 = NULL;
-//    tmp_scale_num2 = NULL;
     discard_saturated_site = true;
     _pattern_lh = NULL;
     _pattern_lh_cat = NULL;
     //root_state = STATE_UNKNOWN;
     root_state = 126;
     theta_all = NULL;
+    buffer_scale_all = NULL;
+    buffer_partial_lh = NULL;
     ptn_freq = NULL;
     ptn_invar = NULL;
     subTreeDistComputed = false;
     dist_matrix = NULL;
     var_matrix = NULL;
-    setLikelihoodKernel(LK_EIGEN_SSE);  // FOR TUNG: you forgot to initialize this variable!
+    params = NULL;
+    setLikelihoodKernel(LK_EIGEN_SSE, 1);  // FOR TUNG: you forgot to initialize this variable!
+    sse = LK_EIGEN_SSE;
+    num_threads = 0;
+    max_lh_slots = 0;
     save_all_trees = 0;
     nodeBranchDists = NULL;
     // FOR: upper bounds
@@ -110,6 +120,7 @@ void PhyloTree::init() {
     current_scaling = 1.0;
     is_opt_scaling = false;
     num_partial_lh_computations = 0;
+    vector_size = 0;
 }
 
 PhyloTree::PhyloTree(Alignment *aln) : MTree(), CheckpointFactory() {
@@ -117,6 +128,15 @@ PhyloTree::PhyloTree(Alignment *aln) : MTree(), CheckpointFactory() {
     this->aln = aln;
 }
 
+// Build a tree by parsing treeString and attach the given alignment.
+// NOTE(review): init() is not called in this constructor; confirm members are set up before setAlignment()
+PhyloTree::PhyloTree(string& treeString, Alignment* aln, bool isRooted) : MTree() {
+    stringstream tree_stream(treeString);
+    freeNode();
+    readTree(tree_stream, isRooted);
+    setAlignment(aln);
+}
+
 void PhyloTree::saveCheckpoint() {
     checkpoint->startStruct("PhyloTree");
     StrVector leafNames;
@@ -191,20 +211,6 @@ PhyloTree::~PhyloTree() {
     if (site_rate)
         delete site_rate;
     site_rate = NULL;
-//    if (tmp_scale_num1)
-//        delete[] tmp_scale_num1;
-//    if (tmp_scale_num2)
-//        delete[] tmp_scale_num2;
-//    if (tmp_partial_lh1)
-//        delete[] tmp_partial_lh1;
-//    if (tmp_partial_lh2)
-//        delete[] tmp_partial_lh2;
-//    if (tmp_anscentral_state_prob1)
-//        delete[] tmp_anscentral_state_prob1;
-//    if (tmp_anscentral_state_prob2)
-//        delete[] tmp_anscentral_state_prob2;
-    //if (tmp_ptn_rates)
-    //	delete [] tmp_ptn_rates;
     if (_pattern_lh_cat)
         aligned_free(_pattern_lh_cat);
     _pattern_lh_cat = NULL;
@@ -216,6 +222,12 @@ PhyloTree::~PhyloTree() {
     if (theta_all)
         aligned_free(theta_all);
     theta_all = NULL;
+    if (buffer_scale_all)
+        aligned_free(buffer_scale_all);
+    buffer_scale_all = NULL;
+    if (buffer_partial_lh)
+        aligned_free(buffer_partial_lh);
+    buffer_partial_lh = NULL;
     if (ptn_freq)
         aligned_free(ptn_freq);
     ptn_freq = NULL;
@@ -330,7 +342,10 @@ void PhyloTree::setAlignment(Alignment *alignment) {
             node->id = seq;
         }
     }
-    if (err) outError("Tree taxa and alignment sequence do not match (see above)");
+    if (err) {
+        printTree(cout, WT_NEWLINE);
+        outError("Tree taxa and alignment sequence do not match (see above)");
+    }
     StrVector taxname;
     getTaxaName(taxname);
     for (StrVector::iterator it = taxname.begin(); it != taxname.end(); it++)
@@ -351,31 +366,30 @@ void PhyloTree::setRootNode(const char *my_root) {
     assert(root);
 }
 
-void PhyloTree::setParams(Params* params) {
-	this->params = params;
-}
+//void PhyloTree::setParams(Params* params) {
+//	this->params = params;
+//}
 
 void PhyloTree::readTreeString(const string &tree_string) {
 	stringstream str(tree_string);
-//	str(tree_string);
-//	str.seekg(0, ios::beg);
 	freeNode();
     
     // bug fix 2016-04-14: in case taxon name happens to be ID
 	MTree::readTree(str, rooted);
     
     assignLeafNames();
-//	setAlignment(aln);
-	setRootNode(params->root);
+	setRootNode(Params::getInstance().root);
 
 	if (isSuperTree()) {
 		((PhyloSuperTree*) this)->mapTrees();
 	}
-	if (params->pll) {
+	if (Params::getInstance().pll) {
 		pllReadNewick(getTreeString());
 	}
 	resetCurScore();
-//	lhComputed = false;
+    if (Params::getInstance().fixStableSplits || Params::getInstance().adaptPertubation) {
+        buildNodeSplit();
+    }
 }
 
 void PhyloTree::readTreeStringSeqName(const string &tree_string) {
@@ -396,6 +410,9 @@ void PhyloTree::readTreeStringSeqName(const string &tree_string) {
 	}
 	resetCurScore();
 //	lhComputed = false;
+    if (params->fixStableSplits) {
+        buildNodeSplit();
+    }
 }
 
 int PhyloTree::wrapperFixNegativeBranch(bool force_change) {
@@ -440,11 +457,16 @@ string PhyloTree::getTreeString() {
 	return tree_stream.str();
 }
 
-string PhyloTree::getTopology() {
+string PhyloTree::getTopologyString(bool printBranchLength) {
     stringstream tree_stream;
     // important: to make topology string unique
     setRootNode(params->root);
-    printTree(tree_stream, WT_TAXON_ID + WT_SORT_TAXA);
+    //printTree(tree_stream, WT_TAXON_ID + WT_SORT_TAXA);
+    if (printBranchLength) {
+        printTree(tree_stream, WT_SORT_TAXA + WT_BR_LEN + WT_TAXON_ID);
+    } else {
+        printTree(tree_stream, WT_SORT_TAXA);
+    }
     return tree_stream.str();
 }
 
@@ -466,7 +488,7 @@ void PhyloTree::setModel(ModelSubst *amodel) {
 void PhyloTree::setModelFactory(ModelFactory *model_fac) {
     model_factory = model_fac;
     if (model_factory && (model_factory->model->isMixture() || model_factory->model->isSiteSpecificModel()))
-    	setLikelihoodKernel(sse);
+    	setLikelihoodKernel(sse, num_threads);
 }
 
 void PhyloTree::setRate(RateHeterogeneity *rate) {
@@ -501,6 +523,7 @@ void PhyloTree::clearAllPartialLH(bool make_null) {
     current_it = current_it_back = NULL;
 }
 
+/*
 void PhyloTree::computeAllPartialLh(PhyloNode *node, PhyloNode *dad) {
 	if (!node) node = (PhyloNode*)root;
 	FOR_NEIGHBOR_IT(node, dad, it) {
@@ -512,6 +535,7 @@ void PhyloTree::computeAllPartialLh(PhyloNode *node, PhyloNode *dad) {
 		computeAllPartialLh((PhyloNode*)(*it)->node, node);
 	}
 }
+*/
 
 string PhyloTree::getModelName() {
 	string name = model->getName();
@@ -625,7 +649,7 @@ void PhyloTree::initializeAllPartialPars(int &index, PhyloNode *node, PhyloNode
 size_t PhyloTree::getBitsBlockSize() {
     // reserve the last entry for parsimony score
 //    return (aln->num_states * aln->size() + UINT_BITS - 1) / UINT_BITS + 1;
-    size_t len = aln->num_states * ((max(aln->size(), (size_t)aln->num_informative_sites) + SIMD_BITS - 1) / UINT_BITS) + 4;
+    size_t len = aln->getMaxNumStates() * ((max(aln->size(), (size_t)aln->num_informative_sites) + SIMD_BITS - 1) / UINT_BITS) + 4;
     len = ((len+7)/8)*8;
     return len;
 }
@@ -664,19 +688,23 @@ int PhyloTree::computeParsimony() {
  likelihood function
  ****************************************************************************/
 
+size_t PhyloTree::getBufferPartialLhSize() {
+    const size_t VECTOR_SIZE = 8; // TODO, adjusted
+    size_t ncat_mix = site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
+    size_t block = model->num_states * ncat_mix;
+    size_t buffer_size = get_safe_upper_limit(block * model->num_states * 2 * aln->getNSeq());
+    buffer_size += get_safe_upper_limit(block * (aln->getNSeq()+1) * (aln->STATE_UNKNOWN+1));
+    buffer_size += (block*2+model->num_states)*VECTOR_SIZE*num_threads;
+    return buffer_size;
+}
+
 void PhyloTree::initializeAllPartialLh() {
     int index, indexlh;
     int numStates = model->num_states;
 	// Minh's question: why getAlnNSite() but not getAlnNPattern() ?
     //size_t mem_size = ((getAlnNSite() % 2) == 0) ? getAlnNSite() : (getAlnNSite() + 1);
-    size_t nptn = getAlnNPattern() + numStates; // extra #numStates for ascertainment bias correction
-
-    size_t mem_size;
-    if (instruction_set >= 7)
-    	mem_size = ((nptn +3)/4)*4;
-    else
-    	mem_size = ((nptn % 2) == 0) ? nptn : (nptn + 1);
-
+    // extra #numStates for ascertainment bias correction
+    size_t mem_size = get_safe_upper_limit(getAlnNPattern()) + get_safe_upper_limit(numStates);
     size_t block_size = mem_size * numStates * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
     // make sure _pattern_lh size is divisible by 4 (e.g., 9->12, 14->16)
     if (!_pattern_lh)
@@ -685,32 +713,26 @@ void PhyloTree::initializeAllPartialLh() {
         _pattern_lh_cat = aligned_alloc<double>(mem_size * site_rate->getNDiscreteRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures()));
     if (!theta_all)
         theta_all = aligned_alloc<double>(block_size);
+    if (!buffer_scale_all)
+        buffer_scale_all = aligned_alloc<double>(mem_size);
+    if (!buffer_partial_lh) {
+        buffer_partial_lh = aligned_alloc<double>(getBufferPartialLhSize());
+    }
     if (!ptn_freq) {
         ptn_freq = aligned_alloc<double>(mem_size);
         ptn_freq_computed = false;
     }
     if (!ptn_invar)
         ptn_invar = aligned_alloc<double>(mem_size);
-    bool benchmark_mem = (!central_partial_lh && verbose_mode >= VB_MED);
-    if (benchmark_mem) {
-    	cout << "Measuring run time for allocating " << getMemoryRequired() << " bytes RAM" << endl;
-    }
-    double cpu_start_time = getCPUTime();
-    double wall_start_time = getRealTime();
     initializeAllPartialLh(index, indexlh);
-    if (benchmark_mem) {
-    	cout << "CPU time for initializeAllPartialLh: " << getCPUTime() - cpu_start_time << " sec" << endl;
-    	cout << "Wall-clock time for initializeAllPartialLh: " << getRealTime() - wall_start_time << " sec" << endl;
-    }
+    if (params->lh_mem_save == LM_MEM_SAVE)
+        mem_slots.init(this, max_lh_slots);
+        
     assert(index == (nodeNum - 1) * 2);
-    if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
-        if (params->lh_mem_save == LM_PER_NODE) {
-            assert(indexlh == nodeNum-leafNum);
-        } else {
-            assert(indexlh == (nodeNum-1)*2-leafNum);
-        }
-    } else
-    	assert(indexlh == (nodeNum-1)*2);
+    if (params->lh_mem_save == LM_PER_NODE) {
+        assert(indexlh == nodeNum-leafNum);
+    }
+
     clearAllPartialLH();
 
 }
@@ -739,7 +761,10 @@ void PhyloTree::deleteAllPartialLh() {
 		aligned_free(ptn_freq);
 	if (theta_all)
 		aligned_free(theta_all);
-
+    if (buffer_scale_all)
+        aligned_free(buffer_scale_all);
+    if (buffer_partial_lh)
+        aligned_free(buffer_partial_lh);
 	if (_pattern_lh_cat)
 		aligned_free(_pattern_lh_cat);
 	if (_pattern_lh)
@@ -752,6 +777,8 @@ void PhyloTree::deleteAllPartialLh() {
 	ptn_freq = NULL;
 	ptn_freq_computed = false;
 	theta_all = NULL;
+    buffer_scale_all = NULL;
+    buffer_partial_lh = NULL;
 	_pattern_lh_cat = NULL;
 	_pattern_lh = NULL;
 
@@ -760,103 +787,106 @@ void PhyloTree::deleteAllPartialLh() {
     clearAllPartialLH();
 }
  
-uint64_t PhyloTree::getMemoryRequired(size_t ncategory) {
-	size_t nptn = aln->getNPattern() + aln->num_states; // +num_states for ascertainment bias correction
-	uint64_t block_size;
-	if (instruction_set >= 7)
-		// block size must be divisible by 4
-		block_size = ((nptn+3)/4)*4;
-	else
-		// block size must be divisible by 2
-		block_size = ((nptn % 2) == 0) ? nptn : (nptn + 1);
-    block_size = block_size * aln->num_states;
+uint64_t PhyloTree::getMemoryRequired(size_t ncategory, bool full_mem) {
+    // +num_states for ascertainment bias correction
+	int64_t nptn = get_safe_upper_limit(aln->getNPattern()) + get_safe_upper_limit(aln->num_states);
+    int64_t scale_block_size = nptn;
     if (site_rate)
-    	block_size *= site_rate->getNRate();
+    	scale_block_size *= site_rate->getNRate();
     else
-    	block_size *= ncategory;
+    	scale_block_size *= ncategory;
     if (model && !model_factory->fused_mix_rate)
-    	block_size *= model->getNMixtures();
-    uint64_t mem_size = ((uint64_t) leafNum*4) * block_size *sizeof(double) + 2 + (leafNum) * 4 * nptn * sizeof(UBYTE);
-    if (params->SSE == LK_EIGEN || params->SSE == LK_EIGEN_SSE) {
-    	mem_size -= ((uint64_t)leafNum) * ((uint64_t)block_size*sizeof(double) + nptn * sizeof(UBYTE));
-        if (params->lh_mem_save == LM_PER_NODE) {
-            mem_size -= ((uint64_t)leafNum*2 - 4) * ((uint64_t)block_size*sizeof(double) + nptn * sizeof(UBYTE));
-        }
-    }
-	uint64_t tip_partial_lh_size;
+    	scale_block_size *= model->getNMixtures();
+
+    int64_t block_size = scale_block_size * aln->num_states;
+
+    int64_t mem_size;
+    // memory for tip_partial_lh
     if (model)
-        tip_partial_lh_size = aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures() * sizeof(double);
+        mem_size = aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures() * sizeof(double);
     else
-        tip_partial_lh_size = aln->num_states * (aln->STATE_UNKNOWN+1) * sizeof(double);
-    mem_size += tip_partial_lh_size;
+        mem_size = aln->num_states * (aln->STATE_UNKNOWN+1) * sizeof(double);
+
+    // memory for UFBoot
     if (params->gbo_replicates)
         mem_size += params->gbo_replicates*nptn*sizeof(BootValType);
+
+    // memory for model
     if (model)
     	mem_size += model->getMemoryRequired();
+
+    int64_t lh_scale_size = block_size * sizeof(double) + scale_block_size * sizeof(UBYTE);
+
+    max_lh_slots = leafNum-2;
+
+    if (!full_mem && params->lh_mem_save == LM_MEM_SAVE) {
+        int64_t min_lh_slots = log2(leafNum)+LH_MIN_CONST;
+        if (params->max_mem_size == 0.0) {
+            max_lh_slots = min_lh_slots;
+        } else if (params->max_mem_size <= 1) {
+            max_lh_slots = floor(params->max_mem_size*(leafNum-2));
+        } else {
+            int64_t rest_mem = params->max_mem_size - mem_size;
+            
+            // include 2 blocks for nni_partial_lh
+            max_lh_slots = rest_mem / lh_scale_size - 2;
+
+            // more RAM available than required: cap the slot count at leafNum-2
+            if (max_lh_slots > leafNum-2)
+                max_lh_slots = leafNum-2;
+        }
+        if (max_lh_slots < min_lh_slots) {
+            cout << "WARNING: Too low -mem, automatically increased to " << (mem_size + (min_lh_slots+2)*lh_scale_size)/1048576.0 << " MB" << endl;
+            max_lh_slots = min_lh_slots;
+        }
+    }
+
+    // also count the memory for the 2 extra nni_partial_lh blocks
+    mem_size += (max_lh_slots+2) * lh_scale_size;
+
+
     return mem_size;
 }
 
 void PhyloTree::getMemoryRequired(uint64_t &partial_lh_entries, uint64_t &scale_num_entries, uint64_t &partial_pars_entries) {
-	size_t nptn = aln->getNPattern() + aln->num_states; // +num_states for ascertainment bias correction
-	uint64_t block_size;
-	if (instruction_set >= 7)
-		// block size must be divisible by 4
-		block_size = ((nptn+3)/4)*4;
-	else
-		// block size must be divisible by 2
-		block_size = ((nptn % 2) == 0) ? nptn : (nptn + 1);
+    // +num_states for ascertainment bias correction
+	uint64_t block_size = get_safe_upper_limit(aln->getNPattern()) + get_safe_upper_limit(aln->num_states);
+    size_t scale_size = block_size;
     block_size = block_size * aln->num_states;
-    if (site_rate)
+    if (site_rate) {
     	block_size *= site_rate->getNRate();
-    if (model && !model_factory->fused_mix_rate)
+        scale_size *= site_rate->getNRate();
+    }
+    if (model && !model_factory->fused_mix_rate) {
     	block_size *= model->getNMixtures();
+        scale_size *= model->getNMixtures();
+    }
 
 	uint64_t tip_partial_lh_size = aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures();
-    if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
-        if (params->lh_mem_save == LM_PER_NODE)
-            partial_lh_entries = ((uint64_t)leafNum - 2) * (uint64_t) block_size + 4 + tip_partial_lh_size;
-        else
-            partial_lh_entries = ((uint64_t)leafNum * 3 - 6) * (uint64_t) block_size + 4 + tip_partial_lh_size;
-    } else
-    	partial_lh_entries = ((uint64_t)leafNum * 4 - 6) * (uint64_t) block_size + 4 + tip_partial_lh_size;
 
-
-	if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
-        if (params->lh_mem_save == LM_PER_NODE)
-            scale_num_entries = (leafNum - 2) * nptn;
-        else
-            scale_num_entries = (leafNum*3 - 4) * nptn;
-	} else
-		scale_num_entries = (leafNum*4 - 4) * nptn;
+    // TODO mem save
+    partial_lh_entries = ((uint64_t)leafNum - 2) * (uint64_t) block_size + 4 + tip_partial_lh_size;
+    scale_num_entries = (leafNum - 2) * scale_size;
 
     size_t pars_block_size = getBitsBlockSize();
     partial_pars_entries = (leafNum - 1) * 4 * pars_block_size;
 }
 
 void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node, PhyloNode *dad) {
-    size_t pars_block_size = getBitsBlockSize();
-    size_t nptn = aln->size()+aln->num_states; // +num_states for ascertainment bias correction
-    size_t block_size;
-    if (instruction_set >= 7)
-    	// block size must be divisible by 4
-    	nptn = ((nptn+3)/4)*4;
-	else
-		// block size must be divisible by 2
-		nptn = ((nptn % 2) == 0) ? nptn : (nptn + 1);
+    uint64_t pars_block_size = getBitsBlockSize();
+    // +num_states for ascertainment bias correction
+    size_t nptn = get_safe_upper_limit(aln->size())+ get_safe_upper_limit(aln->num_states);
+    uint64_t block_size;
+    uint64_t scale_block_size = nptn * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
+    block_size = scale_block_size * model->num_states;
 
-    size_t scale_block_size = nptn;
-//    size_t tip_block_size = nptn * model->num_states;
-
-    block_size = nptn * model->num_states * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
     if (!node) {
         node = (PhyloNode*) root;
         // allocate the big central partial likelihoods memory
+//        size_t IT_NUM = (params->nni5) ? 6 : 2;
+        size_t IT_NUM = 2;
         if (!nni_partial_lh) {
             // allocate memory only once!
-//            intptr_t MEM_ALIGNMENT = (instruction_set >= 7) ? 32 : 16;
-//            nni_partial_lh = aligned_alloc<double>(IT_NUM*partial_lh_size+MEM_ALIGNMENT/sizeof(double));
-//            nni_scale_num = aligned_alloc<UBYTE>(IT_NUM*scale_num_size+MEM_ALIGNMENT/sizeof(UBYTE));
-            size_t IT_NUM = (params->nni5) ? 6 : 2;
             nni_partial_lh = aligned_alloc<double>(IT_NUM*block_size);
             nni_scale_num = aligned_alloc<UBYTE>(IT_NUM*scale_block_size);
         }
@@ -864,16 +894,14 @@ void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node
 
         if (!central_partial_lh) {
         	uint64_t tip_partial_lh_size = aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures();
-            if (model->isSiteSpecificModel() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE))
+            if (model->isSiteSpecificModel())
                 tip_partial_lh_size = get_safe_upper_limit(aln->size()) * model->num_states * leafNum;
-            uint64_t mem_size = ((uint64_t)leafNum * 4 - 6) * (uint64_t) block_size + 2 + tip_partial_lh_size;
-            if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
-                if (params->lh_mem_save == LM_PER_NODE) {
-                    mem_size -= ((uint64_t)leafNum * 3 - 4) * (uint64_t)block_size;
-                } else {
-                    mem_size -= (uint64_t)leafNum * (uint64_t)block_size;
-                }
-            }
+
+            if (max_lh_slots == 0)
+                getMemoryRequired();
+
+            uint64_t mem_size = (uint64_t)max_lh_slots * block_size + 4 + tip_partial_lh_size;
+
             if (verbose_mode >= VB_MED)
                 cout << "Allocating " << mem_size * sizeof(double) << " bytes for partial likelihood vectors" << endl;
             try {
@@ -886,24 +914,15 @@ void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node
         }
 
         // now always assign tip_partial_lh
-        if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
-            if (params->lh_mem_save == LM_PER_NODE) {
-                tip_partial_lh = central_partial_lh + ((nodeNum - leafNum)*block_size);
-            } else {
-                tip_partial_lh = central_partial_lh + (((nodeNum - 1)*2-leafNum)*block_size);
-            }
-        } else
-            tip_partial_lh = central_partial_lh + (((nodeNum - 1)*2)*block_size);
+        if (params->lh_mem_save == LM_PER_NODE) {
+            tip_partial_lh = central_partial_lh + ((nodeNum - leafNum)*block_size);
+        } else {
+            tip_partial_lh = central_partial_lh + (max_lh_slots*block_size);
+        }
 
         if (!central_scale_num) {
-        	uint64_t mem_size = (leafNum - 1) * 4 * scale_block_size;
-        	if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
-                if (params->lh_mem_save == LM_PER_NODE) {
-                    mem_size -= ((uint64_t)leafNum*3 - 2) * (uint64_t) scale_block_size;
-                } else {
-                    mem_size -= (uint64_t)leafNum * (uint64_t) scale_block_size;
-                }
-            }
+        	uint64_t mem_size = max_lh_slots * scale_block_size;
+
             if (verbose_mode >= VB_MED)
                 cout << "Allocating " << mem_size * sizeof(UBYTE) << " bytes for scale num vectors" << endl;
             try {
@@ -943,7 +962,7 @@ void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node
         assert(index < nodeNum * 2 - 1);
         
         // now initialize partial_lh and scale_num
-        if (params->lh_mem_save == LM_PER_NODE && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
+        if (params->lh_mem_save == LM_PER_NODE) {
             if (!node->isLeaf()) { // only allocate memory to internal node
                 nei->partial_lh = NULL; // do not allocate memory for tip, use tip_partial_lh instead
                 nei->scale_num = NULL;
@@ -957,24 +976,18 @@ void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node
                 nei2->partial_lh = NULL;
             }
         } else {
-            if (nei->node->isLeaf() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
-                nei->partial_lh = NULL; // do not allocate memory for tip, use tip_partial_lh instead
-                nei->scale_num = NULL;
-            } else {
-                nei->scale_num = central_scale_num + (indexlh * scale_block_size);
-                nei->partial_lh = central_partial_lh + (indexlh * block_size);
-                indexlh++;
-            }
-            if (nei2->node->isLeaf() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
-                nei2->partial_lh = NULL; // do not allocate memory for tip, use tip_partial_lh instead
-                nei2->scale_num = NULL;
-            } else {
-                nei2->scale_num = central_scale_num + ((indexlh) * scale_block_size);
-                nei2->partial_lh = central_partial_lh + (indexlh * block_size);
-                indexlh++;
-            }
+            nei->partial_lh = NULL;
+            nei->scale_num = NULL;
+            nei2->scale_num = NULL;
+            nei2->partial_lh = NULL;
         }
-        
+
+        // zero memory to allocate contiguous chunk of memory
+//        if (nei->partial_lh)
+//            memset(nei->partial_lh, 0, block_size*sizeof(double));
+//        if (nei2->partial_lh)
+//            memset(nei2->partial_lh, 0, block_size*sizeof(double));
+
 //        if (model->isSiteSpecificModel() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
 //            // allocate tip memory for this model
 //            if (node->isLeaf()) {
@@ -989,32 +1002,31 @@ void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node
 }
 
 double *PhyloTree::newPartialLh() {
-    double *ret = aligned_alloc<double>((aln->size()+aln->num_states+3) * aln->num_states * site_rate->getNRate() *
-                             ((model_factory->fused_mix_rate)? 1 : model->getNMixtures()));
-    return ret;
+    return aligned_alloc<double>(getPartialLhSize());
 }
 
-size_t PhyloTree::getPartialLhBytes() {
-    size_t nptn = aln->size()+aln->num_states; // +num_states for ascertainment bias correction
-    size_t block_size;
-    if (instruction_set >= 7)
-    	// block size must be divisible by 4
-    	block_size = ((nptn+3)/4)*4;
-	else
-		// block size must be divisible by 2
-		block_size = ((nptn % 2) == 0) ? nptn : (nptn + 1);
+size_t PhyloTree::getPartialLhSize() {
+    // +num_states for ascertainment bias correction
+    size_t block_size = get_safe_upper_limit(aln->size())+get_safe_upper_limit(aln->num_states);
+    block_size *= model->num_states * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
+	return block_size;
+}
 
-    block_size = block_size * model->num_states * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
+size_t PhyloTree::getPartialLhBytes() {
+    // +num_states for ascertainment bias correction
+	return getPartialLhSize() * sizeof(double);
+}
 
-	return block_size * sizeof(double);
+size_t PhyloTree::getScaleNumSize() {
+	return (get_safe_upper_limit(aln->size())+get_safe_upper_limit(aln->num_states)) * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
 }
 
 size_t PhyloTree::getScaleNumBytes() {
-	return (aln->size()+aln->num_states) * sizeof(UBYTE);
+	return getScaleNumSize()*sizeof(UBYTE);
 }
 
 UBYTE *PhyloTree::newScaleNum() {
-    return aligned_alloc<UBYTE>(aln->size()+aln->num_states);
+    return aligned_alloc<UBYTE>(getScaleNumSize());
 }
 
 Node *findFirstFarLeaf(Node *node, Node *dad = NULL) {
@@ -1108,58 +1120,90 @@ int PhyloTree::getNumLhCat(SiteLoglType wsl) {
     }
 }
 
+void PhyloTree::transformPatternLhCat() {
+    if (vector_size == 1)
+        return;
+
+    size_t nptn = ((aln->size()+vector_size-1)/vector_size)*vector_size;
+//    size_t nstates = aln->num_states;
+    size_t ncat = site_rate->getNRate();
+    if (!model_factory->fused_mix_rate) ncat *= model->getNMixtures();
+
+    double *mem = aligned_alloc<double>(nptn*ncat);
+    memcpy(mem, _pattern_lh_cat, nptn*ncat*sizeof(double));
+    double *memptr = mem;
+
+    size_t ptn, cat, i;
+    for (ptn = 0; ptn < nptn; ptn+=vector_size) {
+        double *lh_cat_ptr = &_pattern_lh_cat[ptn*ncat];
+        for (cat = 0; cat < ncat; cat++) {
+            for (i = 0; i < vector_size; i++)
+                lh_cat_ptr[i*ncat+cat] = memptr[i];
+            memptr += vector_size;
+        }
+    }
+    aligned_free(mem);
+}
+
 double PhyloTree::computePatternLhCat(SiteLoglType wsl) {
     if (!current_it) {
         Node *leaf = findFirstFarLeaf(root);
         current_it = (PhyloNeighbor*)leaf->neighbors[0];
         current_it_back = (PhyloNeighbor*)current_it->node->findNeighbor(leaf);
     }
-//    if (sse == LK_NORMAL || sse == LK_SSE) {
-//        if (getModel()->isMixture())
-//            outError("Naive kernel does not support mixture models, contact author if you really need this feature");
-//        return computeLikelihoodBranchNaive(current_it, (PhyloNode*)current_it_back->node);
-//    } else 
-    if (!getModel()->isMixture())
-        return computeLikelihoodBranchEigen(current_it, (PhyloNode*)current_it_back->node);
+
+    double score;
+
+    score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
+    // TODO: SIMD aware
+    transformPatternLhCat();
+    /*
+    if (getModel()->isSiteSpecificModel()) {
+        score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
+    } else if (!getModel()->isMixture())
+        score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
     else if (getModelFactory()->fused_mix_rate)
-        return computeMixrateLikelihoodBranchEigen(current_it, (PhyloNode*)current_it_back->node);
+        score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
     else {
-        double score = computeMixtureLikelihoodBranchEigen(current_it, (PhyloNode*)current_it_back->node);
-        if (wsl == WSL_MIXTURE_RATECAT) return score;
-        
-        double *lh_cat = _pattern_lh_cat;
-        double *lh_res = _pattern_lh_cat;
-        size_t ptn, nptn = aln->getNPattern();
-        size_t m, nmixture = getModel()->getNMixtures();
-        size_t c, ncat = getRate()->getNRate();
-        if (wsl == WSL_MIXTURE && ncat > 1) {
-            // transform to lh per mixture class
-            for (ptn = 0; ptn < nptn; ptn++) {
-                for (m = 0; m < nmixture; m++) {
-                    double lh = lh_cat[0];
-                    for (c = 1; c < ncat; c++)
-                        lh += lh_cat[c];
-                    lh_res[m] = lh;
-                    lh_cat += ncat;
+        score = computeLikelihoodBranch(current_it, (PhyloNode*)current_it_back->node);
+    */
+    if (!getModel()->isSiteSpecificModel() && getModel()->isMixture() && !getModelFactory()->fused_mix_rate) {
+        if (wsl == WSL_MIXTURE || wsl == WSL_RATECAT) {
+            double *lh_cat = _pattern_lh_cat;
+            double *lh_res = _pattern_lh_cat;
+            size_t ptn, nptn = aln->getNPattern();
+            size_t m, nmixture = getModel()->getNMixtures();
+            size_t c, ncat = getRate()->getNRate();
+            if (wsl == WSL_MIXTURE && ncat > 1) {
+                // transform to lh per mixture class
+                for (ptn = 0; ptn < nptn; ptn++) {
+                    for (m = 0; m < nmixture; m++) {
+                        double lh = lh_cat[0];
+                        for (c = 1; c < ncat; c++)
+                            lh += lh_cat[c];
+                        lh_res[m] = lh;
+                        lh_cat += ncat;
+                    }
+                    lh_res += nmixture;
                 }
-                lh_res += nmixture;
-            }
-        } else if (wsl == WSL_RATECAT && nmixture > 1) {
-            // transform to lh per rate category
-            for (ptn = 0; ptn < nptn; ptn++) {
-                if (lh_res != lh_cat)
-                    memcpy(lh_res, lh_cat, ncat*sizeof(double));
-                lh_cat += ncat;
-                for (m = 1; m < nmixture; m++) {
-                    for (c = 0; c < ncat; c++)
-                        lh_res[c] += lh_cat[c];
+            } else if (wsl == WSL_RATECAT && nmixture > 1) {
+                // transform to lh per rate category
+                for (ptn = 0; ptn < nptn; ptn++) {
+                    if (lh_res != lh_cat)
+                        memcpy(lh_res, lh_cat, ncat*sizeof(double));
                     lh_cat += ncat;
+                    for (m = 1; m < nmixture; m++) {
+                        for (c = 0; c < ncat; c++)
+                            lh_res[c] += lh_cat[c];
+                        lh_cat += ncat;
+                    }
+                    lh_res += ncat;
                 }
-                lh_res += ncat;
             }
         }
-        return score;
     }
+    
+    return score;
 }
 
 void PhyloTree::computePatternStateFreq(double *ptn_state_freq) {
@@ -1255,7 +1299,12 @@ void PhyloTree::computePatternLikelihood(double *ptn_lh, double *cur_logl, doubl
     } else {
         memmove(ptn_lh, _pattern_lh, nptn * sizeof(double));
     }
-    if (ptn_lh_cat) {
+
+    if (!ptn_lh_cat)
+        return;
+
+    /*
+    if (ptn_lh_cat && model->isSiteSpecificModel()) {
     	int offset = 0;
     	if (sum_scaling == 0.0) {
     		int nptncat = nptn * ncat;
@@ -1282,7 +1331,74 @@ void PhyloTree::computePatternLikelihood(double *ptn_lh, double *cur_logl, doubl
 					ptn_lh_cat[offset] = log(_pattern_lh_cat[offset]) + scale;
 			}
     	}
+        return;
+    }
+    */
+    
+    // New kernel
+    int ptn;
+    PhyloNeighbor *nei1 = current_it;
+    PhyloNeighbor *nei2 = current_it_back;
+    if (!nei1->node->isLeaf() && nei2->node->isLeaf()) {
+        // exchange
+        PhyloNeighbor *tmp = nei1;
+        nei1 = nei2;
+        nei2 = tmp;
+    }
+    if (nei1->node->isLeaf()) {
+        // external branch
+        double *lh_cat = _pattern_lh_cat;
+        double *out_lh_cat = ptn_lh_cat;
+        UBYTE *nei2_scale = nei2->scale_num;
+        if (params->lk_safe_scaling || leafNum >= params->numseq_safe_scaling) {
+            // per-category scaling
+            for (ptn = 0; ptn < nptn; ptn++) {
+                for (i = 0; i < ncat; i++) {
+                    out_lh_cat[i] = log(lh_cat[i]) + nei2_scale[i] * LOG_SCALING_THRESHOLD;
+                }
+                lh_cat += ncat;
+                out_lh_cat += ncat;
+                nei2_scale += ncat;
+            }
+        } else {
+            // normal scaling
+            for (ptn = 0; ptn < nptn; ptn++) {
+                double scale = nei2_scale[ptn] * LOG_SCALING_THRESHOLD;
+                for (i = 0; i < ncat; i++)
+                    out_lh_cat[i] = log(lh_cat[i]) + scale;
+                lh_cat += ncat;
+                out_lh_cat += ncat;
+            }
+        }
+    } else {
+        // internal branch
+        double *lh_cat = _pattern_lh_cat;
+        double *out_lh_cat = ptn_lh_cat;
+        UBYTE *nei1_scale = nei1->scale_num;
+        UBYTE *nei2_scale = nei2->scale_num;
+        if (params->lk_safe_scaling || leafNum >= params->numseq_safe_scaling) {
+            // per-category scaling
+            for (ptn = 0; ptn < nptn; ptn++) {
+                for (i = 0; i < ncat; i++) {
+                    out_lh_cat[i] = log(lh_cat[i]) + (nei1_scale[i]+nei2_scale[i]) * LOG_SCALING_THRESHOLD;
+                }
+                lh_cat += ncat;
+                out_lh_cat += ncat;
+                nei1_scale += ncat;
+                nei2_scale += ncat;
+            }
+        } else {
+            // normal scaling
+            for (ptn = 0; ptn < nptn; ptn++) {
+                double scale = (nei1_scale[ptn] + nei2_scale[ptn]) * LOG_SCALING_THRESHOLD;
+                for (i = 0; i < ncat; i++)
+                    out_lh_cat[i] = log(lh_cat[i]) + scale;
+                lh_cat += ncat;
+                out_lh_cat += ncat;
+            }
+        }
     }
+
 //    if (cur_logl) {
 //        double check_score = 0.0;
 //        for (int i = 0; i < nptn; i++) {
@@ -2010,10 +2126,11 @@ double PhyloTree::computeBayesianBranchLength(PhyloNeighbor *dad_branch, PhyloNo
      if (node->isLeaf() || dad->isLeaf()) {
      return -1.0;
      }*/
-    if ((dad_branch->partial_lh_computed & 1) == 0)
-        computePartialLikelihood(dad_branch, dad);
-    if ((node_branch->partial_lh_computed & 1) == 0)
-        computePartialLikelihood(node_branch, node);
+     // TODO
+//    if ((dad_branch->partial_lh_computed & 1) == 0)
+//        computePartialLikelihood(dad_branch, dad);
+//    if ((node_branch->partial_lh_computed & 1) == 0)
+//        computePartialLikelihood(node_branch, node);
     // now combine likelihood at the branch
     int nstates = aln->num_states;
     int numCat = site_rate->getNRate();
@@ -2227,6 +2344,7 @@ void PhyloTree::optimizeOneBranch(PhyloNode *node1, PhyloNode *node2, bool clear
     double ferror, optx;
     assert(current_len >= 0.0);
     theta_computed = false;
+//    mem_slots.cleanup();
     if (optimize_by_newton) {
     	// Newton-Raphson method
     	optx = minimizeNewton(params->min_branch_length, current_len, params->max_branch_length, params->min_branch_length, negative_lh, maxNRStep);
@@ -2342,8 +2460,12 @@ double PhyloTree::optimizeAllBranches(int my_iterations, double tolerance, int m
 //            printTree(cout, WT_BR_LEN+WT_NEWLINE);
 //        }
 
-        for (int j = 0; j < nodes.size(); j++)
+        for (int j = 0; j < nodes.size(); j++) {
             optimizeOneBranch((PhyloNode*)nodes[j], (PhyloNode*)nodes2[j]);
+            if (verbose_mode >= VB_MAX) {
+                cout << "Branch " << nodes[j]->id << " " << nodes2[j]->id << ": " << computeLikelihoodFromBuffer() << endl;
+            }
+        }
 
 //        if (i == 0) 
 //            optimizeOneBranch((PhyloNode*)nodes[0], (PhyloNode*)nodes2[0]);
@@ -2796,52 +2918,52 @@ int PhyloTree::fixNegativeBranch(bool force, Node *node, Node *dad) {
  Nearest Neighbor Interchange by maximum likelihood
  ****************************************************************************/
 
-void PhyloTree::doOneRandomNNI(Node *node1, Node *node2) {
-	assert(isInnerBranch(node1, node2));
-    Neighbor *node1Nei = NULL;
-    Neighbor *node2Nei = NULL;
-    // randomly choose one neighbor from node1 and one neighbor from node2
-    bool chooseNext = false;
-	FOR_NEIGHBOR_IT(node1, node2, it){
-		if (chooseNext) {
-			node1Nei = (*it);
-			break;
-		}
-         
-		int randNum = random_int(2); // randNum is either 0 or 1
-		if (randNum == 0) {
-			node1Nei = (*it);
-			break;
-		} else {
-			chooseNext = true;
-		}
+void PhyloTree::doOneRandomNNI(Branch branch) { // build one NNI move around (branch.first, branch.second) and apply it if the constraint tree accepts it
+	assert(isInnerBranch(branch.first, branch.second));
+    NNIMove nni;
+    nni.node1 = (PhyloNode*) branch.first;
+    nni.node2 = (PhyloNode*) branch.second;
+	FOR_NEIGHBOR_IT(branch.first, branch.second, node1NeiIt) { // presumably visits neighbors of branch.first other than branch.second -- confirm macro
+		nni.node1Nei_it = node1NeiIt;
+		break; // always the first neighbor visited; only the node2 side is chosen at random
 	}
-	chooseNext = false;
-	FOR_NEIGHBOR_IT(node2, node1, it){
-		if (chooseNext) {
-			node2Nei = (*it);
-			break;
-		}
-		int randNum = random_int(2);
-		if (randNum == 0) {
-			node2Nei = (*it);
+    int randInt = random_int(branch.second->neighbors.size()-1); // random position among the neighbors visited by the loop below
+    int cnt = 0;
+	FOR_NEIGHBOR_IT(branch.second, branch.first, node2NeiIt) {
+		if (cnt == randInt) {
+			nni.node2Nei_it = node2NeiIt;
 			break;
 		} else {
-			chooseNext = true;
+			cnt++;
 		}
 	}
-	assert(node1Nei != NULL && node2Nei != NULL);
-
-    NeighborVec::iterator node1NeiIt = node1->findNeighborIt(node1Nei->node);
-    NeighborVec::iterator node2NeiIt = node2->findNeighborIt(node2Nei->node);
-    assert(node1NeiIt != node1->neighbors.end());
-    assert(node1NeiIt != node2->neighbors.end());
-
-    node1->updateNeighbor(node1NeiIt, node2Nei);
-    node2Nei->node->updateNeighbor(node2, node1);
+    if (constraintTree.isCompatible(nni)) // the move is silently dropped when constraintTree rejects it
+        doNNI(nni, true); // second argument is doNNI's clearLH flag
+}
 
-    node2->updateNeighbor(node2NeiIt, node1Nei);
-    node1Nei->node->updateNeighbor(node1, node2);
+    
+NNIMove PhyloTree::getRandomNNI(Branch &branch) { // build (but do not apply) one NNI move around the inner branch (branch.first, branch.second)
+    assert(isInnerBranch(branch.first, branch.second));
+    NNIMove nni;
+    nni.node1 = (PhyloNode*) branch.first;
+    nni.node2 = (PhyloNode*) branch.second;
+
+    FOR_NEIGHBOR_IT(branch.first, branch.second, node1NeiIt) { // presumably visits neighbors of branch.first other than branch.second -- confirm macro
+            nni.node1Nei_it = node1NeiIt;
+            break; // always the first neighbor visited; only the node2 side is chosen at random
+        }
+    int randInt = random_int(branch.second->neighbors.size()-1); // random position among the neighbors visited by the loop below
+    int cnt = 0;
+    FOR_NEIGHBOR_IT(branch.second, branch.first, node2NeiIt) {
+            if (cnt == randInt) {
+                nni.node2Nei_it = node2NeiIt;
+                break;
+            } else {
+                cnt++;
+            }
+        }
+    nni.newloglh = 0.0; // placeholder score; swap_id, newLen and ptnlh are not set here
+    return nni;
 }
 
 void PhyloTree::doNNI(NNIMove &move, bool clearLH) {
@@ -2871,9 +2993,9 @@ void PhyloTree::doNNI(NNIMove &move, bool clearLH) {
     PhyloNeighbor *node21_it = (PhyloNeighbor*) node2->findNeighbor(node1); // return neighbor of node2 which points to node 1
 
     // reorient partial_lh before swap
-    if (params->lh_mem_save == LM_PER_NODE && !isSuperTree() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
-        node12_it->reorientPartialLh(node1);
-        node21_it->reorientPartialLh(node2);
+    if (!isSuperTree()) {
+        reorientPartialLh(node12_it, node1);
+        reorientPartialLh(node21_it, node2);
     }
     
     // do the NNI swap
@@ -2896,11 +3018,14 @@ void PhyloTree::doNNI(NNIMove &move, bool clearLH) {
      outError("Wrong ID");
      }*/
 
+    PhyloNeighbor *nei12 = (PhyloNeighbor*) node1->findNeighbor(node2); // return neighbor of node1 which points to node 2
+    PhyloNeighbor *nei21 = (PhyloNeighbor*) node2->findNeighbor(node1); // return neighbor of node2 which points to node 1
 
     if (clearLH) {
         // clear partial likelihood vector
-        node12_it->clearPartialLh();
-        node21_it->clearPartialLh();
+        nei12->clearPartialLh();
+        nei21->clearPartialLh();
+        nei12->size = nei21->size = 0;
 
         node2->clearReversePartialLh(node1);
         node1->clearReversePartialLh(node2);
@@ -2911,6 +3036,20 @@ void PhyloTree::doNNI(NNIMove &move, bool clearLH) {
     if (params->leastSquareNNI) {
     	updateSubtreeDists(move);
     }
+
+    // update split store in node
+    if (nei12->split != NULL || nei21->split != NULL) {
+        delete nei12->split;
+        nei12->split = new Split(leafNum);
+        delete nei21->split;
+        nei21->split = new Split(leafNum);
+
+        FOR_NEIGHBOR_IT(nei12->node, node1, it)
+                *(nei12->split) += *((*it)->split);
+
+        FOR_NEIGHBOR_IT(nei21->node, node2, it)
+                *(nei21->split) += *((*it)->split);
+    }
 }
 
 void PhyloTree::changeNNIBrans(NNIMove nnimove) {
@@ -2953,13 +3092,15 @@ NNIMove PhyloTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove
     size_t partial_lh_size = getPartialLhBytes()/sizeof(double);
     size_t scale_num_size = getScaleNumBytes()/sizeof(UBYTE);
 
+
     // Upper Bounds ---------------
-    totalNNIub += 2;
+    /*
     if(params->upper_bound_NNI){
+    	totalNNIub += 2;
     	NNIMove resMove;
     	resMove = getBestNNIForBranUB(node1,node2,this);
-    	/* if UB is smaller than the current likelihood, then we don't recompute the likelihood of the swapped topology.
-    	 * Otherwise, follow the normal procedure: evaluate NNIs and compute the likelihood.*/
+    	// if UB is smaller than the current likelihood, then we don't recompute the likelihood of the swapped topology.
+        // Otherwise, follow the normal procedure: evaluate NNIs and compute the likelihood.
 
     	// here, we skip NNI is its UB n times worse than the curLikelihood
     	if( resMove.newloglh < (1+params->upper_bound_frac)*this->curScore){
@@ -2967,7 +3108,8 @@ NNIMove PhyloTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove
     		return resMove;
     	}
     }
-
+    */
+    
     //-----------------------------
 
 	NeighborVec::iterator it;
@@ -2988,14 +3130,20 @@ NNIMove PhyloTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove
 	assert(id == IT_NUM);
 
 	Neighbor *saved_nei[6];
+    int mem_id = 0;
 	// save Neighbor and allocate new Neighbor pointer
 	for (id = 0; id < IT_NUM; id++) {
 		saved_nei[id] = (*saved_it[id]);
 		*saved_it[id] = new PhyloNeighbor(saved_nei[id]->node, saved_nei[id]->length);
-		((PhyloNeighbor*) (*saved_it[id]))->partial_lh = nni_partial_lh + id*partial_lh_size;
-		((PhyloNeighbor*) (*saved_it[id]))->scale_num = nni_scale_num + id*scale_num_size;
+        if (((PhyloNeighbor*)saved_nei[id])->partial_lh) {
+            ((PhyloNeighbor*) (*saved_it[id]))->partial_lh = nni_partial_lh + mem_id*partial_lh_size;
+            ((PhyloNeighbor*) (*saved_it[id]))->scale_num = nni_scale_num + mem_id*scale_num_size;
+            mem_id++;
+            mem_slots.addSpecialNei((PhyloNeighbor*)*saved_it[id]);
+        }
 //		((PhyloNeighbor*) (*saved_it[id]))->scale_num = newScaleNum();
 	}
+    assert(mem_id == 2);
 
 	// get the Neighbor again since it is replaced for saving purpose
 	PhyloNeighbor* node12_it = (PhyloNeighbor*) node1->findNeighbor(node2);
@@ -3037,22 +3185,36 @@ NNIMove PhyloTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove
     // Initialize node1 and node2 in nniMoves
 	nniMoves[0].node1 = nniMoves[1].node1 = node1;
 	nniMoves[0].node2 = nniMoves[1].node2 = node2;
+    nniMoves[0].newloglh = nniMoves[1].newloglh = -DBL_MAX;
 
     double backupScore = curScore;
 
-    for (cnt = 0; cnt < 2; cnt++) {
+    for (cnt = 0; cnt < 2; cnt++) if (constraintTree.isCompatible(nniMoves[cnt])) 
+    {
         // do the NNI swap
     	NeighborVec::iterator node1_it = nniMoves[cnt].node1Nei_it;
     	NeighborVec::iterator node2_it = nniMoves[cnt].node2Nei_it;
         Neighbor *node1_nei = *node1_it;
         Neighbor *node2_nei = *node2_it;
 
+        // reorient partial_lh before swap
+        if (!isSuperTree()) {
+            reorientPartialLh(node12_it, node1);
+            reorientPartialLh(node21_it, node2);
+        }
+
         node1->updateNeighbor(node1_it, node2_nei);
         node2_nei->node->updateNeighbor(node2, node1);
 
         node2->updateNeighbor(node2_it, node1_nei);
         node1_nei->node->updateNeighbor(node1, node2);
 
+        if (params->lh_mem_save == LM_MEM_SAVE) {
+            // reset subtree size to change traversal order
+            for (id = 0; id < IT_NUM; id++)
+                ((PhyloNeighbor*)*saved_it[id])->size = 0;
+        }
+
 		// clear partial likelihood vector
 		node12_it->clearPartialLh();
 		node21_it->clearPartialLh();
@@ -3088,6 +3250,8 @@ NNIMove PhyloTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove
 			node12_it->clearPartialLh();
 		}
 		double score = computeLikelihoodFromBuffer();
+        if (verbose_mode >= VB_DEBUG)
+            cout << "NNI " << node1->id << " - " << node2->id << ": " << score << endl;
 		nniMoves[cnt].newloglh = score;
 		// compute the pattern likelihoods if wanted
 		if (nniMoves[cnt].ptnlh)
@@ -3097,6 +3261,12 @@ NNIMove PhyloTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove
 			saveCurrentTree(score); // BQM: for new bootstrap
 		}
 
+        // reorient partial_lh before swapping back
+        if (!isSuperTree()) {
+            reorientPartialLh(node12_it, node1);
+            reorientPartialLh(node21_it, node2);
+        }
+
         // else, swap back, also recover the branch lengths
 		node1->updateNeighbor(node1_it, node1_nei);
 		node1_nei->node->updateNeighbor(node2, node1);
@@ -3110,12 +3280,21 @@ NNIMove PhyloTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove
 	 for (id = IT_NUM-1; id >= 0; id--) {
 //		 aligned_free(((PhyloNeighbor*) *saved_it[id])->scale_num);
 		 //delete[] ((PhyloNeighbor*) *saved_it[id])->partial_lh;
+//         if (((PhyloNeighbor*)saved_nei[id])->partial_lh) {
+//            if (saved_nei[id]->node == node1)
+//                mem_slots.restore(node21_it, (PhyloNeighbor*)saved_nei[id]);
+//            else
+//                mem_slots.restore(node12_it, (PhyloNeighbor*)saved_nei[id]);
+//         }
 		 if (*saved_it[id] == current_it) current_it = (PhyloNeighbor*) saved_nei[id];
 		 if (*saved_it[id] == current_it_back) current_it_back = (PhyloNeighbor*) saved_nei[id];
 
 		 delete (*saved_it[id]);
 		 (*saved_it[id]) = saved_nei[id];
 	 }
+
+    mem_slots.eraseSpecialNei();
+
 //	 aligned_free(new_partial_lh);
 
 	 // restore the length of 4 branches around node1, node2
@@ -4469,36 +4648,89 @@ void PhyloTree::computeSeqIdentityAlongTree() {
 }
 
 void PhyloTree::generateRandomTree(TreeGenType tree_type) {
+    if (!constraintTree.empty() && tree_type != YULE_HARDING) // with a constraint tree, only the Yule-Harding generator is available (see the else-branch below)
+        outError("Only Yule-Harding ramdom tree supported with constraint tree"); // NOTE(review): "ramdom" is a typo for "random" in this user-facing message
     assert(aln);
     int orig_size = params->sub_size;
     params->sub_size = aln->getNSeq();
     MExtTree ext_tree;
-	switch (tree_type) {
-	case YULE_HARDING: 
-		ext_tree.generateYuleHarding(*params);
-		break;
-	case UNIFORM:
-		ext_tree.generateUniform(params->sub_size);
-		break;
-	case CATERPILLAR:
-		ext_tree.generateCaterpillar(params->sub_size);
-		break;
-	case BALANCED:
-		ext_tree.generateBalanced(params->sub_size);
-		break;
-	case STAR_TREE:
-		ext_tree.generateStarTree(*params);
-		break;
-	default:
-		break;
-	}
+    if (constraintTree.empty()) { // unconstrained: generate by tree_type, then name the taxa from the alignment
+        switch (tree_type) {
+        case YULE_HARDING: 
+            ext_tree.generateYuleHarding(*params);
+            break;
+        case UNIFORM:
+            ext_tree.generateUniform(params->sub_size);
+            break;
+        case CATERPILLAR:
+            ext_tree.generateCaterpillar(params->sub_size);
+            break;
+        case BALANCED:
+            ext_tree.generateBalanced(params->sub_size);
+            break;
+        case STAR_TREE:
+            ext_tree.generateStarTree(*params);
+            break;
+        default: // any other tree_type: no generator is called
+            break;
+        }
+        NodeVector taxa;
+        ext_tree.getTaxa(taxa);
+        assert(taxa.size() == aln->getNSeq());
+        for (NodeVector::iterator it = taxa.begin(); it != taxa.end(); it++)
+            (*it)->name = aln->getSeqName((*it)->id); // the taxon id is used as the index into the alignment's sequence names
+    } else {
+        ext_tree.generateConstrainedYuleHarding(*params, &constraintTree, aln->getSeqNames()); // sequence names are handed to the generator; no renaming pass in this branch
+    }
     params->sub_size = orig_size;
-	NodeVector taxa;
-	ext_tree.getTaxa(taxa);
-	assert(taxa.size() == aln->getNSeq());
-	for (NodeVector::iterator it = taxa.begin(); it != taxa.end(); it++)
-		(*it)->name = aln->getSeqName((*it)->id);
     stringstream str;
     ext_tree.printTree(str);
     PhyloTree::readTreeStringSeqName(str.str());
 }
+
+/*
+void PhyloTree::sortNeighborBySubtreeSize(PhyloNode *node, PhyloNode *dad) {
+
+    // already sorted, return
+    PhyloNeighbor *nei = (PhyloNeighbor*)dad->findNeighbor(node);
+    if (nei->size >= 1)
+        return;
+
+    if (dad && node->isLeaf()) {
+        nei->size = 1;
+        return;
+    }
+
+    nei->size = 0;
+    FOR_NEIGHBOR_DECLARE(node, dad, it) {
+        sortNeighborBySubtreeSize((PhyloNode*)(*it)->node, node);
+        nei->size += ((PhyloNeighbor*)*it)->size;
+    }
+    
+    // sort neighbors in descending order of sub-tree size
+    FOR_NEIGHBOR(node, dad, it)
+        for (NeighborVec::iterator it2 = it+1; it2 != node->neighbors.end(); it2++)
+            if ((*it2)->node != dad && ((PhyloNeighbor*)*it)->size < ((PhyloNeighbor*)*it2)->size) {
+                Neighbor *nei;
+                nei = *it;
+                *it = *it2;
+                *it2 = nei;
+            }
+}
+*/
+
+void PhyloTree::reorientPartialLh(PhyloNeighbor* dad_branch, Node *dad) { // give dad_branch a partial_lh by taking one over from a back-branch of its node, if it has none yet
+    if (dad_branch->partial_lh) // already has a partial_lh: nothing to do
+        return;
+    Node * node = dad_branch->node;
+    FOR_NEIGHBOR_IT(node, dad, it) { // presumably visits node's neighbors other than dad -- confirm macro
+        PhyloNeighbor *backnei = (PhyloNeighbor*)(*it)->node->findNeighbor(node); // the neighbor's own branch pointing back at node
+        if (backnei->partial_lh) {
+            mem_slots.takeover(dad_branch, backnei); // NOTE(review): expected to hand backnei's partial_lh over to dad_branch -- confirm in memslot.cpp
+            break; // only the first back-branch that holds a partial_lh is used
+        }
+    }
+    if (params->lh_mem_save == LM_PER_NODE)
+        assert(dad_branch->partial_lh && "partial_lh is not re-oriented"); // checked in LM_PER_NODE mode only; in other modes partial_lh may still be NULL here
+}
+
diff --git a/phylotree.h b/phylotree.h
index 69187c3..09f66a9 100644
--- a/phylotree.h
+++ b/phylotree.h
@@ -1,14 +1,24 @@
-//
-// C++ Interface: phylotree
-//
-// Description:
-//
-//
-// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
-//
-// Copyright: See COPYING file that comes with this distribution
-//
-//
+/***************************************************************************
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
 
 #ifndef PHYLOTREE_H
 #define PHYLOTREE_H
@@ -30,8 +40,11 @@
 #include "phylonode.h"
 #include "optimization.h"
 #include "model/rateheterogeneity.h"
+#include "candidateset.h"
 #include "pll/pll.h"
 #include "checkpoint.h"
+#include "constrainttree.h"
+#include "memslot.h"
 
 #define BOOT_VAL_FLOAT
 #define BootValType float
@@ -39,6 +52,8 @@
 
 extern int instruction_set;
 
+#define SAFE_LH   true  // safe likelihood scaling to avoid numerical underflow for ultra large trees
+#define NORM_LH  false // normal likelihood scaling
 
 const double TOL_BRANCH_LEN = 0.000001; // NEVER TOUCH THIS CONSTANT AGAIN PLEASE!
 const double TOL_LIKELIHOOD = 0.001; // NEVER TOUCH THIS CONSTANT AGAIN PLEASE!
@@ -58,6 +73,10 @@ const int SPR_DEPTH = 2;
 //using namespace Eigen;
 
 inline size_t get_safe_upper_limit(size_t cur_limit) {
+	if (instruction_set >= 9)
+		// AVX-512
+		return ((cur_limit+7)/8)*8;
+	else
 	if (instruction_set >= 7)
 		// AVX
 		return ((cur_limit+3)/4)*4;
@@ -67,6 +86,10 @@ inline size_t get_safe_upper_limit(size_t cur_limit) {
 }
 
 inline size_t get_safe_upper_limit_float(size_t cur_limit) {
+	if (instruction_set >= 9)
+		// AVX-512
+		return ((cur_limit+15)/16)*16;
+	else
 	if (instruction_set >= 7)
 		// AVX
 		return ((cur_limit+7)/8)*8;
@@ -89,7 +112,7 @@ inline size_t get_safe_upper_limit_float(size_t cur_limit) {
 
 template< class T>
 inline T *aligned_alloc(size_t size) {
-	size_t MEM_ALIGNMENT = (instruction_set >= 7) ? 32 : 16;
+	size_t MEM_ALIGNMENT = (instruction_set >= 9) ? 64 : ((instruction_set >= 7) ? 32 : 16);
     void *mem;
 
 #if defined WIN32 || defined _WIN32 || defined __WIN32__
@@ -150,6 +173,30 @@ typedef std::map< int, PhyloNode* > IntPhyloNodeMap;
 
 const int MAX_SPR_MOVES = 20;
 
+struct NNIMove {
+
+    // Two nodes representing the central branch
+    PhyloNode *node1, *node2;
+
+    // Roots of the two subtrees that are swapped (iterators into the neighbor lists of node1 and node2)
+    NeighborVec::iterator node1Nei_it, node2Nei_it;
+
+    // log-likelihood of the tree after applying the NNI
+    double newloglh;
+
+    int swap_id; // NOTE(review): meaning is not documented here -- confirm against callers
+
+    // new branch lengths of 5 branches corresponding to the NNI
+    double newLen[5];
+
+    // pattern likelihoods
+    double *ptnlh; // raw pointer; nothing in this struct allocates or frees it
+
+    bool operator<(const NNIMove & rhs) const { // inverted comparison: the move with the HIGHER newloglh compares as "less"
+        return newloglh > rhs.newloglh;
+    }
+};
+
 /**
         an SPR move.
  */
@@ -204,34 +251,6 @@ struct SwapNNIParam {
     double *nni2_ptnlh;
 };
 
-struct NNIMove {
-    // Two nodes representing the central branch
-    PhyloNode *node1, *node2;
-    // Roots of the two subtree that are swapped
-    NeighborVec::iterator node1Nei_it, node2Nei_it;
-
-    // log-likelihood of the tree after applying the NNI
-    double newloglh;
-
-    int swap_id;
-
-    // old branch lengths of 5 branches before doing NNI
-    //double oldLen[5];
-
-    // new branch lengths of 5 branches corresponding to the NNI
-    double newLen[5];
-
-    // pattern likelihoods
-    double *ptnlh;
-
-    bool operator<(const NNIMove & rhs) const {
-        return newloglh > rhs.newloglh;
-        //return delta > rhs.delta;
-    }
-};
-
-
-
 struct LeafFreq {
     int leaf_id;
 
@@ -299,6 +318,28 @@ struct SeqQuartetInfo {
 // END definitions for likelihood mapping (HAS)
 // ********************************************
 
+
+// ********************************************
+// BEGIN traversal information
+// ********************************************
+
+class TraversalInfo { // one (dad_branch, dad) pair plus two double* fields
+public:
+    PhyloNeighbor *dad_branch;
+    PhyloNode *dad;
+    double *echildren; // not set by the constructor; must be assigned before it is read
+    double *partial_lh_leaves; // not set by the constructor; must be assigned before it is read
+
+    TraversalInfo(PhyloNeighbor *dad_branch, PhyloNode *dad) { // stores the two arguments only
+        this->dad = dad;
+        this->dad_branch = dad_branch;
+    }
+};
+
+// ********************************************
+// END traversal information
+// ********************************************
+
 /**
 Phylogenetic Tree class
 
@@ -313,6 +354,8 @@ class PhyloTree : public MTree, public Optimization, public CheckpointFactory {
 	friend class RateKategory;
     friend class ModelMixture;
     friend class RateFree;
+    friend class MemSlotVector;
+    friend class ModelFactory;
 
 public:
     /**
@@ -328,6 +371,12 @@ public:
      */
     PhyloTree(Alignment *aln);
 
+    /**
+     *  Create a phylotree from the tree string and assign alignment.
+     *  Taxa IDs are numbered according to their orders in the alignment.
+     */
+    PhyloTree(string& treeString, Alignment *aln, bool isRooted);
+
     void init();
 
     /**
@@ -483,7 +532,7 @@ public:
     /****************************************************************************
             Dot product
      ****************************************************************************/
-    template <class Numeric, class VectorClass, const int VCSIZE>
+    template <class Numeric, class VectorClass>
     Numeric dotProductSIMD(Numeric *x, Numeric *y, int size);
 
     typedef BootValType (PhyloTree::*DotProductType)(BootValType *x, BootValType *y, int size);
@@ -492,11 +541,21 @@ public:
     typedef double (PhyloTree::*DotProductDoubleType)(double *x, double *y, int size);
     DotProductDoubleType dotProductDouble;
 
+    double dotProductDoubleCall(double *x, double *y, int size);
+
 #if defined(BINARY32) || defined(__NOAVX__)
     void setDotProductAVX() {}
+    void setDotProductFMA() {}
 #else
     void setDotProductAVX();
+    void setDotProductFMA();
+#ifdef INCLUDE_AVX512
+    void setDotProductAVX512();
+#endif
 #endif
+
+    void setDotProductSSE();
+
     /**
             this function return the parsimony or likelihood score of the tree. Default is
             to compute the parsimony score. Override this function if you define a new
@@ -577,10 +636,14 @@ public:
     virtual void setParsimonyKernelAVX();
 #endif
 
+    virtual void setParsimonyKernelSSE();
+
     /****************************************************************************
             likelihood function
      ****************************************************************************/
 
+    size_t getBufferPartialLhSize();
+
     /**
             initialize partial_lh vector of all PhyloNeighbors, allocating central_partial_lh
      */
@@ -623,6 +686,7 @@ public:
 
     /** get the number of bytes occupied by partial_lh */
     size_t getPartialLhBytes();
+    size_t getPartialLhSize();
 
     /**
             allocate memory for a scale num vector
@@ -631,6 +695,7 @@ public:
 
     /** get the number of bytes occupied by scale_num */
     size_t getScaleNumBytes();
+    size_t getScaleNumSize();
 
     /**
      * this stores partial_lh for each state at the leaves of the tree because they are the same between leaves
@@ -641,38 +706,84 @@ public:
 
     bool ptn_freq_computed;
 
+    /** vector size used by SIMD kernel */
+    size_t vector_size;
+
+    /** number of threads used for likelihood kernel */
+    int num_threads;
+
+
+    /****************************************************************************
+            helper functions for computing tree traversal
+     ****************************************************************************/
+
+
+    /**
+        compute traversal_info of a subtree
+    */
+    inline bool computeTraversalInfo(PhyloNeighbor *dad_branch, PhyloNode *dad, double* &buffer);
+
+
+    /**
+        compute traversal_info of both subtrees
+    */
+    template<class VectorClass, const int nstates>
+    void computeTraversalInfo(PhyloNode *node, PhyloNode *dad, bool compute_partial_lh);
+    template<class VectorClass>
+    void computeTraversalInfo(PhyloNode *node, PhyloNode *dad, bool compute_partial_lh);
+
+    /**
+        precompute info for models
+    */
+    template<class VectorClass, const int nstates>
+    void computePartialInfo(TraversalInfo &info, VectorClass* buffer);
+    template<class VectorClass>
+    void computePartialInfo(TraversalInfo &info, VectorClass* buffer);
+
+    /** 
+        sort neighbors in descending order of subtree size (number of leaves within subtree)
+        @param node the starting node, NULL to start from the root
+        @param dad dad of the node, used to direct the search
+    */
+    void sortNeighborBySubtreeSize(PhyloNode *node, PhyloNode *dad);
+
     /****************************************************************************
             computing partial (conditional) likelihood of subtrees
      ****************************************************************************/
 
+    /** transform _pattern_lh_cat from "interleaved" to "sequential", due to vector_size > 1 */
+    void transformPatternLhCat();
+
     void computeTipPartialLikelihood();
     void computePtnInvar();
     void computePtnFreq();
 
+
     /**
             compute the partial likelihood at a subtree
             @param dad_branch the branch leading to the subtree
             @param dad its dad, used to direct the tranversal
      */
-    virtual void computePartialLikelihood(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
-    typedef void (PhyloTree::*ComputePartialLikelihoodType)(PhyloNeighbor *, PhyloNode *);
+    virtual void computePartialLikelihood(TraversalInfo &info, size_t ptn_left, size_t ptn_right, int thread_id);
+    typedef void (PhyloTree::*ComputePartialLikelihoodType)(TraversalInfo &info, size_t ptn_left, size_t ptn_right, int thread_id);
     ComputePartialLikelihoodType computePartialLikelihoodPointer;
 
 
     //template <const int nstates>
-    void computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
-    
-    //template <const int nstates>
-    void computeMixturePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+//    void computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
 
-    //template <const int nstates>
-    void computeMixratePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+//    void computeSitemodelPartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
 
-    void computeSitemodelPartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+//    template <class VectorClass, const int VCSIZE, const int nstates>
+//    void computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
 
-    template <class VectorClass, const int VCSIZE, const int nstates>
-    void computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+    template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false, const bool SITE_MODEL = false>
+    void computePartialLikelihoodSIMD(TraversalInfo &info, size_t ptn_left, size_t ptn_right, int thread_id);
 
+    template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false, const bool SITE_MODEL = false>
+    void computePartialLikelihoodGenericSIMD(TraversalInfo &info, size_t ptn_left, size_t ptn_right, int thread_id);
+
+    /*
     template <class VectorClass, const int VCSIZE, const int nstates>
     void computeMixratePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
 
@@ -681,6 +792,7 @@ public:
 
     template <class VectorClass, const int VCSIZE, const int nstates>
     void computeSitemodelPartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+    */
 
     /****************************************************************************
             computing likelihood on a branch
@@ -705,19 +817,20 @@ public:
 //    inline double computeLikelihoodBranchFast(PhyloNeighbor *dad_branch, PhyloNode *dad);
 
     //template <const int nstates>
-    double computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
+//    double computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
 
-    //template <const int nstates>
-    double computeMixtureLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
+//    double computeSitemodelLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
 
-    //template <const int nstates>
-    double computeMixrateLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
+//    template <class VectorClass, const int VCSIZE, const int nstates>
+//    double computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
 
-    double computeSitemodelLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
+    template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false, const bool SITE_MODEL = false>
+    double computeLikelihoodBranchSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
 
-    template <class VectorClass, const int VCSIZE, const int nstates>
-    double computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
+    template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false, const bool SITE_MODEL = false>
+    double computeLikelihoodBranchGenericSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
 
+    /*
     template <class VectorClass, const int VCSIZE, const int nstates>
     double computeMixrateLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
 
@@ -726,6 +839,7 @@ public:
 
     template <class VectorClass, const int VCSIZE, const int nstates>
     double computeSitemodelLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
+    */
 
     /****************************************************************************
             computing likelihood on a branch using buffer
@@ -742,9 +856,16 @@ public:
     typedef double (PhyloTree::*ComputeLikelihoodFromBufferType)();
     ComputeLikelihoodFromBufferType computeLikelihoodFromBufferPointer;
 
-    template <class VectorClass, const int VCSIZE, const int nstates>
-    double computeLikelihoodFromBufferEigenSIMD();
+//    template <class VectorClass, const int VCSIZE, const int nstates>
+//    double computeLikelihoodFromBufferEigenSIMD();
+
+    template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false, const bool SITE_MODEL = false>
+    double computeLikelihoodFromBufferSIMD();
+
+    template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false, const bool SITE_MODEL = false>
+    double computeLikelihoodFromBufferGenericSIMD();
 
+    /*
     template <class VectorClass, const int VCSIZE, const int nstates>
     double computeMixrateLikelihoodFromBufferEigenSIMD();
 
@@ -755,6 +876,7 @@ public:
     double computeSitemodelLikelihoodFromBufferEigenSIMD();
 
     double computeSitemodelLikelihoodFromBufferEigen();
+    */
 
     /**
             compute tree likelihood when a branch length collapses to zero
@@ -798,6 +920,43 @@ public:
     */
     void computePatternStateFreq(double *ptn_state_freq);
 
+    /****************************************************************************
+            ancestral sequence reconstruction
+     ****************************************************************************/
+
+    /**
+        compute ancestral sequence probability for an internal node by marginal reconstruction
+        (Yang, Kumar and Nei 1995)
+        @param dad_branch branch leading to an internal node where to obtain ancestral sequence
+        @param dad dad of the target internal node
+        @param[out] ptn_ancestral_prob pattern ancestral probability vector of dad_branch->node
+    */
+    void computeMarginalAncestralProbability(PhyloNeighbor *dad_branch, PhyloNode *dad, double *ptn_ancestral_prob);
+
+    /**
+     	 compute the joint ancestral states at a pattern (Pupko et al. 2000)
+     */
+    void computeJointAncestralSequences(int *ancestral_seqs);
+
+    /**
+     * compute max ancestral likelihood according to
+     *  step 1-3 of the dynamic programming algorithm of Pupko et al. 2000, MBE 17:890-896
+     *  @param dad_branch branch leading to an internal node where to obtain ancestral sequence
+     *  @param dad dad of the target internal node
+     *  @param[out] C array storing all information about max ancestral states
+     */
+    void computeAncestralLikelihood(PhyloNeighbor *dad_branch, PhyloNode *dad, int *C);
+
+    /**
+     * compute max ancestral states according to
+     *  step 4-5 of the dynamic programming algorithm of Pupko et al. 2000, MBE 17:890-896
+     *  @param dad_branch branch leading to an internal node where to obtain ancestral sequence
+     *  @param dad dad of the target internal node
+     *  @param C array storing all information about max ancestral states
+     *  @param[out] ancestral_seqs array of size nptn*nnode for ancestral sequences at all internal nodes
+     */
+    void computeAncestralState(PhyloNeighbor *dad_branch, PhyloNode *dad, int *C, int *ancestral_seqs);
+
     /**
             compute pattern likelihoods only if the accumulated scaling factor is non-zero.
             Otherwise, copy the pattern_lh attribute
@@ -915,11 +1074,13 @@ public:
             @param tree_string tree string to read from
      */
     void readTreeFile(const string &file_name);
-
-    /**
+    
+    /*
             refactored 2015-12-22: Taxon IDs instead of Taxon names to save space!
      * Return the tree string contining taxon IDs and branch lengths
      * @return
+     * @param format (WT_TAXON_ID, WT_BR_LEN, ...)
+     * @return the tree string with the specified format
      */
     virtual string getTreeString();
 
@@ -940,8 +1101,10 @@ public:
 
     /**
      *  Return the sorted topology without branch length, used to compare tree topology
+     *  @param printBranchLength
+     *      true/false: whether branch lengths are printed in the returned string
      */
-    string getTopology();
+    string getTopologyString(bool printBranchLength);
 
 
     bool checkEqualScalingFactor(double &sum_scaling, PhyloNode *node = NULL, PhyloNode *dad = NULL);
@@ -951,19 +1114,27 @@ public:
      ****************************************************************************/
 
     //template <const int nstates>
-    void computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+//    void computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
 
-    //template <const int nstates>
-    void computeMixtureLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+//    void computeSitemodelLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
 
-    //template <const int nstates>
-    void computeMixrateLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+//    template <class VectorClass, const int VCSIZE, const int nstates>
+//    void computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
 
-    void computeSitemodelLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+    template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false, const bool SITE_MODEL = false>
+    void computeLikelihoodBufferSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, size_t ptn_lower, size_t ptn_upper, int thread_id);
 
-    template <class VectorClass, const int VCSIZE, const int nstates>
-    void computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+    template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false, const bool SITE_MODEL = false>
+    void computeLikelihoodBufferGenericSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, size_t ptn_lower, size_t ptn_upper, int thread_id);
+
+
+    template <class VectorClass, const bool SAFE_NUMERIC, const int nstates, const bool FMA = false, const bool SITE_MODEL = false>
+    void computeLikelihoodDervSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+
+    template <class VectorClass, const bool SAFE_NUMERIC, const bool FMA = false, const bool SITE_MODEL = false>
+    void computeLikelihoodDervGenericSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
 
+    /*
     template <class VectorClass, const int VCSIZE, const int nstates>
     void computeMixrateLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
 
@@ -972,6 +1143,7 @@ public:
 
     template <class VectorClass, const int VCSIZE, const int nstates>
     void computeSitemodelLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+    */
 
     /**
             compute tree likelihood and derivatives on a branch. used to optimize branch length
@@ -990,6 +1162,9 @@ public:
             Stepwise addition (greedy) by maximum parsimony
      ****************************************************************************/
 
+    /** constraint tree used to guide tree search */
+    ConstraintTree constraintTree;
+
     /**
             FAST VERSION: used internally by computeParsimonyTree() to find the best target branch to add into the tree
             @param added_node node to add
@@ -1118,6 +1293,11 @@ public:
      */
     double* theta_all;
 
+    /** total scaling buffer */
+    double *buffer_scale_all;
+
+    /** buffer used when computing partial_lh, to avoid repeated mem allocation */
+    double *buffer_partial_lh;
 
     /**
      * frequencies of alignment patterns, used as buffer for likelihood computation
@@ -1131,6 +1311,9 @@ public:
      */
     double *ptn_invar;
 
+    vector<TraversalInfo> traversal_info;
+
+
     /****************************************************************************
             Nearest Neighbor Interchange by maximum likelihood
      ****************************************************************************/
@@ -1146,7 +1329,7 @@ public:
             search by a nearest neigbor interchange
             @return the likelihood of the tree
      */
-    double optimizeNNI();
+    //double optimizeNNI();
 
     /**
             search by a nearest neigbor interchange
@@ -1180,12 +1363,20 @@ public:
     virtual void doNNI(NNIMove &move, bool clearLH = true);
 
     /**
+     * [DEPRECATED]
      * Randomly choose perform an NNI, out of the two defined by branch node1-node2.
      * This function also clear the corresponding partial likelihood vectors
-     * @param node1 one node of the branch
-     * @param node2 one node of the branch
+     *
+     * @param branch the branch on which a random NNI is done
      */
-    void doOneRandomNNI(Node *node1, Node *node2);
+    void doOneRandomNNI(Branch branch);
+
+    /**
+    *   Get a random NNI from an internal branch, checking for consistency with constraintTree
+    *   @param branch the internal branch
+    *   @return an NNIMove, node1 and node2 are set to NULL if not consistent with constraintTree
+    */
+    NNIMove getRandomNNI(Branch& branch);
 
 
     /**
@@ -1312,6 +1503,12 @@ public:
     */
     void generateRandomTree(TreeGenType tree_type);
 
+
+    /**
+        test the best number of threads
+    */
+    int testNumThreads();
+
     /****************************************************************************
             Subtree Pruning and Regrafting by maximum likelihood
             NOTE: NOT DONE YET
@@ -1464,13 +1661,20 @@ public:
 
     virtual void changeLikelihoodKernel(LikelihoodKernel lk);
 
-    virtual void setLikelihoodKernel(LikelihoodKernel lk);
+    virtual void setLikelihoodKernel(LikelihoodKernel lk, int num_threads);
 
 #if defined(BINARY32) || defined(__NOAVX__)
     virtual void setLikelihoodKernelAVX() {}
+    virtual void setLikelihoodKernelFMA() {}
 #else
     virtual void setLikelihoodKernelAVX();
+    virtual void setLikelihoodKernelFMA();
+#ifdef INCLUDE_AVX512
+    virtual void setLikelihoodKernelAVX512();
 #endif
+#endif
+    virtual void setLikelihoodKernelSSE();
+    
     /****************************************************************************
             Public variables
      ****************************************************************************/
@@ -1539,11 +1743,6 @@ public:
 
 	double minStateFreq;
 
-    /*
-     * 		Store the all the parameters for the program
-     */
-    Params* params;
-
     /** sequence names that were removed */
 	StrVector removed_seqs;
 
@@ -1582,7 +1781,7 @@ public:
      * compute the memory size required for storing partial likelihood vectors
      * @return memory size required in bytes
      */
-    virtual uint64_t getMemoryRequired(size_t ncategory = 1);
+    virtual uint64_t getMemoryRequired(size_t ncategory = 1, bool full_mem = false);
 
     void getMemoryRequired(uint64_t &partial_lh_entries, uint64_t &scale_num_entries, uint64_t &partial_pars_entries);
 
@@ -1601,10 +1800,7 @@ public:
 
     void approxAllBranches(PhyloNode *node = NULL, PhyloNode *dad = NULL);
 
-    /** set pointer of params variable */
-	virtual void setParams(Params* params);
-
-	double getCurScore() {
+    double getCurScore() {
 		return curScore;
 	}
 
@@ -1622,7 +1818,6 @@ public:
 		    curScore = -DBL_MAX;
         if (model)
             initializeAllPartialLh();
-//		clearAllPartialLH();
 	}
 
     void computeSeqIdentityAlongTree(Split &resp, Node *node = NULL, Node *dad = NULL);
@@ -1630,6 +1825,7 @@ public:
 
     double *getPatternLhCatPointer() { return _pattern_lh_cat; }
 
+
 protected:
 
     /**
@@ -1638,11 +1834,6 @@ protected:
     pllInstance *pllInst;
 
     /**
-     *  Whether the partial likelihood vectors have been computed for PLL
-     */
-//    bool lhComputed;
-
-    /**
      *	PLL data structure for alignment
      */
     pllAlignmentData *pllAlignment;
@@ -1663,7 +1854,7 @@ protected:
     bool subTreeDistComputed;
 
     /**
-     * Map data structure to store distance between subtree.
+     * Map data structure to store the distance between subtrees.
      * The key is a string which is constructed by concatenating IDs of
      * the 2 nodes, e.g. 15-16
      */
@@ -1691,10 +1882,9 @@ protected:
     double *_pattern_lh;
 
     /**
-            internal pattern likelihoods per category, always stored after calling computeLikelihood()
-            or related functions. Note that scaling factors are not incorporated here.
-            If you want to get real pattern likelihoods, please use computePatternLikelihood()
-     */
+            internal pattern likelihoods per category;
+            for efficiency, only stored after calling the non-SSE computeLikelihood
+    */
     double *_pattern_lh_cat;
 
     /**
@@ -1760,6 +1950,16 @@ protected:
      */
     UINT *central_partial_pars;
 
+    void reorientPartialLh(PhyloNeighbor* dad_branch, Node *dad);
+
+    //----------- memory saving technique ------//
+
+    /** maximum number of partial_lh_slots */
+    int64_t max_lh_slots;
+
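+    /** memory slots used by the memory saving technique (original note "mapping from" was left unfinished; see MemSlotVector) */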
+    MemSlotVector mem_slots;
+
     /**
             TRUE to discard saturated for Meyer & von Haeseler (2003) model
      */
@@ -1812,7 +2012,7 @@ protected:
      * Current score of the tree;
      */
     double curScore;
-    
+
     /** current best parsimony score */
     UINT best_pars_score;
 
diff --git a/phylotreeavx.cpp b/phylotreeavx.cpp
index b4d0245..ff39ddf 100644
--- a/phylotreeavx.cpp
+++ b/phylotreeavx.cpp
@@ -6,11 +6,17 @@
  */
 
 
-#include "phylokernel.h"
-#include "phylokernelmixture.h"
-#include "phylokernelmixrate.h"
-#include "phylokernelsitemodel.h"
+#include "vectorclass/vectormath_exp.h"
 #include "vectorclass/vectorclass.h"
+#include "phylokernel.h"
+//#include "phylokernelsafe.h"
+//#include "phylokernelmixture.h"
+//#include "phylokernelmixrate.h"
+//#include "phylokernelsitemodel.h"
+
+#include "phylokernelnew.h"
+#define KERNEL_FIX_STATES
+#include "phylokernelnew.h"
 
 #ifndef __AVX__
 #error "You must compile this file with AVX enabled!"
@@ -23,112 +29,139 @@ void PhyloTree::setParsimonyKernelAVX() {
 
 void PhyloTree::setDotProductAVX() {
 #ifdef BOOT_VAL_FLOAT
-		dotProduct = &PhyloTree::dotProductSIMD<float, Vec8f, 8>;
+		dotProduct = &PhyloTree::dotProductSIMD<float, Vec8f>;
 #else
-		dotProduct = &PhyloTree::dotProductSIMD<double, Vec4d, 4>;
+		dotProduct = &PhyloTree::dotProductSIMD<double, Vec4d>;
 #endif
-
-        dotProductDouble = &PhyloTree::dotProductSIMD<double, Vec4d, 4>;
+        dotProductDouble = &PhyloTree::dotProductSIMD<double, Vec4d>;
 }
 
 void PhyloTree::setLikelihoodKernelAVX() {
+    vector_size = 4;
     setParsimonyKernelAVX();
+
+    if (model_factory && model_factory->model->isSiteSpecificModel() && (params->lk_safe_scaling || leafNum >= params->numseq_safe_scaling)) {
+        switch (aln->num_states) {
+        case 4:
+            computeLikelihoodBranchPointer     = &PhyloTree::computeLikelihoodBranchSIMD    <Vec4d, SAFE_LH, 4, false, true>;
+            computeLikelihoodDervPointer       = &PhyloTree::computeLikelihoodDervSIMD      <Vec4d, SAFE_LH, 4, false, true>;
+            computePartialLikelihoodPointer    =  &PhyloTree::computePartialLikelihoodSIMD  <Vec4d, SAFE_LH, 4, false, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, SAFE_LH, 4, false, true>;
+            break;
+        case 20:
+            computeLikelihoodBranchPointer     = &PhyloTree::computeLikelihoodBranchSIMD    <Vec4d, SAFE_LH, 20, false, true>;
+            computeLikelihoodDervPointer       = &PhyloTree::computeLikelihoodDervSIMD      <Vec4d, SAFE_LH, 20, false, true>;
+            computePartialLikelihoodPointer    = &PhyloTree::computePartialLikelihoodSIMD   <Vec4d, SAFE_LH, 20, false, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, SAFE_LH, 20, false, true>;
+            break;
+        default:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD        <Vec4d, SAFE_LH, false, true>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD            <Vec4d, SAFE_LH, false, true>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD      <Vec4d, SAFE_LH, false, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec4d, SAFE_LH, false, true>;
+            break;
+        }
+        return;
+    }
+
     if (model_factory && model_factory->model->isSiteSpecificModel()) {
         switch (aln->num_states) {
         case 4:
-            computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigenSIMD<Vec4d, 4, 4>;
-            computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigenSIMD<Vec4d, 4, 4>;
-            computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigenSIMD<Vec4d, 4, 4>;
-            computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigenSIMD<Vec4d, 4, 4>;
+            computeLikelihoodBranchPointer     = &PhyloTree::computeLikelihoodBranchSIMD    <Vec4d, NORM_LH, 4, false, true>;
+            computeLikelihoodDervPointer       = &PhyloTree::computeLikelihoodDervSIMD      <Vec4d, NORM_LH, 4, false, true>;
+            computePartialLikelihoodPointer    =  &PhyloTree::computePartialLikelihoodSIMD  <Vec4d, NORM_LH, 4, false, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, NORM_LH, 4, false, true>;
             break;
         case 20:
-            computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigenSIMD<Vec4d, 4, 20>;
-            computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigenSIMD<Vec4d, 4, 20>;
-            computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigenSIMD<Vec4d, 4, 20>;
-            computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigenSIMD<Vec4d, 4, 20>;
+            computeLikelihoodBranchPointer     = &PhyloTree::computeLikelihoodBranchSIMD    <Vec4d, NORM_LH, 20, false, true>;
+            computeLikelihoodDervPointer       = &PhyloTree::computeLikelihoodDervSIMD      <Vec4d, NORM_LH, 20, false, true>;
+            computePartialLikelihoodPointer    = &PhyloTree::computePartialLikelihoodSIMD   <Vec4d, NORM_LH, 20, false, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, NORM_LH, 20, false, true>;
             break;
         default:
-            computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigen;
-            computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigen;
-            computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigen;
-            computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigen;
-            break;        
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD        <Vec4d, NORM_LH, false, true>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD            <Vec4d, NORM_LH, false, true>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD      <Vec4d, NORM_LH, false, true>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec4d, NORM_LH, false, true>;
+            break;
         }
         return;
     }
 
+    if (params->lk_safe_scaling || leafNum >= params->numseq_safe_scaling) {
 	switch(aln->num_states) {
+        /*
+        case 2:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec4d, SAFE_LH, 2>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec4d, SAFE_LH, 2>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec4d, SAFE_LH, 2>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, SAFE_LH, 2>;
+            break;
+        */
+        case 4:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec4d, SAFE_LH, 4>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec4d, SAFE_LH, 4>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec4d, SAFE_LH, 4>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, SAFE_LH, 4>;
+            break;
+        case 20:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec4d, SAFE_LH, 20>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec4d, SAFE_LH, 20>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec4d, SAFE_LH, 20>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, SAFE_LH, 20>;
+            break;
+        /*
+        case 64:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec4d, SAFE_LH, 64>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec4d, SAFE_LH, 64>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec4d, SAFE_LH, 64>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, SAFE_LH, 64>;
+            break;
+        */
+        default:
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD<Vec4d, SAFE_LH>;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD<Vec4d, SAFE_LH>;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD<Vec4d, SAFE_LH>;
+            computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec4d, SAFE_LH>;
+            break;
+        }
+        return;
+    }
+
+	switch(aln->num_states) {
+    /*
+	case 2:
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec4d, NORM_LH, 2>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec4d, NORM_LH, 2>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec4d, NORM_LH, 2>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, NORM_LH, 2>;
+		break;
+    */
 	case 4:
-		if (model_factory && model_factory->model->isMixture()) {
-			if (model_factory->fused_mix_rate) {
-				computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigenSIMD<Vec4d, 4, 4>;
-				computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigenSIMD<Vec4d, 4, 4>;
-				computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigenSIMD<Vec4d, 4, 4>;
-				computeLikelihoodFromBufferPointer = &PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD<Vec4d, 4, 4>;
-//		        cout << "Fast-AVX-semi-mixture" << endl;
-			} else {
-				computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigenSIMD<Vec4d, 4, 4>;
-				computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigenSIMD<Vec4d, 4, 4>;
-				computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigenSIMD<Vec4d, 4, 4>;
-				computeLikelihoodFromBufferPointer = &PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD<Vec4d, 4, 4>;
-//		        cout << "Fast-AVX-mixture" << endl;
-			}
-		} else {
-			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec4d, 4, 4>;
-			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec4d, 4, 4>;
-			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec4d, 4, 4>;
-			computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec4d, 4, 4>;
-//	        cout << "Fast-AVX" << endl;
-		}
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec4d, NORM_LH, 4>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec4d, NORM_LH, 4>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec4d, NORM_LH, 4>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, NORM_LH, 4>;
 		break;
 	case 20:
-		if (model_factory && model_factory->model->isMixture()) {
-			if (model_factory->fused_mix_rate) {
-				computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigenSIMD<Vec4d, 4, 20>;
-				computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigenSIMD<Vec4d, 4, 20>;
-				computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigenSIMD<Vec4d, 4, 20>;
-				computeLikelihoodFromBufferPointer = &PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD<Vec4d, 4, 20>;
-//		        cout << "Fast-AVX-semi-mixture" << endl;
-			} else {
-				computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigenSIMD<Vec4d, 4, 20>;
-				computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigenSIMD<Vec4d, 4, 20>;
-				computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigenSIMD<Vec4d, 4, 20>;
-				computeLikelihoodFromBufferPointer = &PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD<Vec4d, 4, 20>;
-//		        cout << "Fast-AVX-mixture" << endl;
-			}
-		} else {
-			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec4d, 4, 20>;
-			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec4d, 4, 20>;
-			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec4d, 4, 20>;
-			computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec4d, 4, 20>;
-//	        cout << "Fast-AVX" << endl;
-		}
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec4d, NORM_LH, 20>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec4d, NORM_LH, 20>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec4d, NORM_LH, 20>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, NORM_LH, 20>;
 		break;
+    /*
 	case 64:
-		if (model_factory && model_factory->model->isMixture()) {
-			if (model_factory->fused_mix_rate) {
-				computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigenSIMD<Vec4d, 4, 64>;
-				computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigenSIMD<Vec4d, 4, 64>;
-				computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigenSIMD<Vec4d, 4, 64>;
-				computeLikelihoodFromBufferPointer = &PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD<Vec4d, 4, 64>;
-//		        cout << "Fast-AVX-semi-mixture" << endl;
-			} else {
-				computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigenSIMD<Vec4d, 4, 64>;
-				computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigenSIMD<Vec4d, 4, 64>;
-				computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigenSIMD<Vec4d, 4, 64>;
-				computeLikelihoodFromBufferPointer = &PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD<Vec4d, 4, 64>;
-//		        cout << "Fast-AVX-mixture" << endl;
-			}
-		} else {
-			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec4d, 4, 64>;
-			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec4d, 4, 64>;
-			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec4d, 4, 64>;
-			computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec4d, 4, 64>;
-//	        cout << "Fast-AVX" << endl;
-		}
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSIMD<Vec4d, NORM_LH, 64>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSIMD<Vec4d, NORM_LH, 64>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSIMD<Vec4d, NORM_LH, 64>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferSIMD<Vec4d, NORM_LH, 64>;
 		break;
+    */
 	default:
-		assert(0);
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD<Vec4d, NORM_LH>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD<Vec4d, NORM_LH>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD<Vec4d, NORM_LH>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec4d, NORM_LH>;
 		break;
 	}
 }
diff --git a/phylotreepars.cpp b/phylotreepars.cpp
index 57b3ae4..0cea097 100644
--- a/phylotreepars.cpp
+++ b/phylotreepars.cpp
@@ -8,9 +8,23 @@
  */
 
 #include "phylotree.h"
-#include "vectorclass/vectorclass.h"
+//#include "vectorclass/vectorclass.h"
 #include "phylosupertree.h"
 
+#if defined (__GNUC__) || defined(__clang__)
+#define vml_popcnt __builtin_popcount
+#else
+// taken from vectorclass library
+static inline uint32_t vml_popcnt (uint32_t a) {	
+    // popcnt instruction not available
+    uint32_t b = a - ((a >> 1) & 0x55555555);
+    uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333);
+    uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F;
+    uint32_t e = d * 0x01010101;
+    return   e >> 24;
+}
+#endif
+
 /***********************************************************/
 /****** optimized version of parsimony kernel **************/
 /***********************************************************/
@@ -19,132 +33,128 @@ void PhyloTree::computePartialParsimonyFast(PhyloNeighbor *dad_branch, PhyloNode
     if (dad_branch->partial_lh_computed & 2)
         return;
     Node *node = dad_branch->node;
-    int nstates = aln->num_states;
-    int site;
+    int nstates = aln->getMaxNumStates();
+    int site = 0;
 
     dad_branch->partial_lh_computed |= 2;
 
+    vector<Alignment*> *partitions = NULL;
+    if (aln->isSuperAlignment())
+        partitions = &((SuperAlignment*)aln)->partitions;
+    else {
+        partitions = new vector<Alignment*>;
+        partitions->push_back(aln);
+    }
+
     if (node->isLeaf() && dad) {
         // external node
-        if (aln->ordered_pattern.empty())
-            aln->orderPatternByNumChars();
         int leafid = node->id;
-        int pars_size = getBitsBlockSize();
-        memset(dad_branch->partial_pars, 0, pars_size*sizeof(UINT));
-//        int ptn;
-//        int nptn = aln->size();
-    	int ambi_aa[] = {2, 3, 5, 6, 9, 10}; // {4+8, 32+64, 512+1024};
+        memset(dad_branch->partial_pars, 0, getBitsBlockSize()*sizeof(UINT));
         int max_sites = ((aln->num_informative_sites+UINT_BITS-1)/UINT_BITS)*UINT_BITS;
-        Alignment::iterator pat;
-    	switch (aln->seq_type) {
-    	case SEQ_DNA:
-//            nptn = aln->ordered_pattern.size();
-            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
-//                Pattern *pat = &aln->ordered_pattern[ptn];
-//                if (!pat->is_informative)
-//                    continue;
-            	int state = pat->at(leafid);
-                int freq = pat->frequency;
-                if (state < 4) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        dad_branch->partial_pars[(site/UINT_BITS)*4+state] |= (1 << (site % UINT_BITS));
-                    }
-                } else if (state == aln->STATE_UNKNOWN) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*4);
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        p[0] |= bit1;
-                        p[1] |= bit1;
-                        p[2] |= bit1;
-                        p[3] |= bit1;
-                    }
-                } else {
-                	state -= 3;
-                    for (int j = 0; j < freq; j++, site++) {
-                        UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*4);
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        for (int i = 0; i < 4; i++)
-                            if (state & (1<<i))
-                                p[i] |= bit1;
+        int ambi_aa[] = {2, 3, 5, 6, 9, 10}; // {4+8, 32+64, 512+1024};
+        if (aln->ordered_pattern.empty())
+            aln->orderPatternByNumChars();
+        int start_pos = 0;
+        for (vector<Alignment*>::iterator alnit = partitions->begin(); alnit != partitions->end(); alnit++) {
+            int end_pos = start_pos + (*alnit)->ordered_pattern.size();
+            switch ((*alnit)->seq_type) {
+            case SEQ_DNA:
+                for (int patid = start_pos; patid != end_pos; patid++) {
+                    Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
+                    int state = pat->at(leafid);
+                    int freq = pat->frequency;
+                    if (state < 4) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            dad_branch->partial_pars[(site/UINT_BITS)*nstates+state] |= (1 << (site % UINT_BITS));
+                        }
+                    } else if (state == (*alnit)->STATE_UNKNOWN) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            p[0] |= bit1;
+                            p[1] |= bit1;
+                            p[2] |= bit1;
+                            p[3] |= bit1;
+                        }
+                    } else {
+                        state -= 3;
+                        assert(state < 15);
+                        for (int j = 0; j < freq; j++, site++) {
+                            UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            for (int i = 0; i < 4; i++)
+                                if (state & (1<<i))
+                                    p[i] |= bit1;
+                        }
                     }
                 }
-            }
-            assert(site == aln->num_informative_sites);
-            // add dummy states
-            if (site < max_sites)
-            	dad_branch->partial_pars[(site/UINT_BITS)*4] |= ~((1<<(site%UINT_BITS)) - 1);
-//            for (; site < max_sites; site++) {
-//                dad_branch->partial_pars[(site/UINT_BITS)*4] |= (1 << (site%UINT_BITS));
-//            }
-    		break;
-    	case SEQ_PROTEIN:
-            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
-//                if (!aln->at(ptn).is_informative)
-//                    continue;
-            	int state = pat->at(leafid);
-                int freq = pat->frequency;
-                if (state < 20) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        dad_branch->partial_pars[(site/UINT_BITS)*20+state] |= (1 << (site % UINT_BITS));
-                    }
-                } else if (state == aln->STATE_UNKNOWN) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*20);
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        for (int i = 0; i < 20; i++)
-                                p[i] |= bit1;
-                    }
-                } else {
-                	assert(state < 23);
-            		state = (state-20)*2;
-                    for (int j = 0; j < freq; j++, site++) {
-                        UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*20);
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        p[ambi_aa[state]] |= bit1;
-                        p[ambi_aa[state+1]] |= bit1;
+                //assert(site == aln->num_informative_sites);
+                // add dummy states
+                //if (site < max_sites)
+                //    dad_branch->partial_pars[(site/UINT_BITS)*4] |= ~((1<<(site%UINT_BITS)) - 1);
+                break;
+            case SEQ_PROTEIN:
+                for (int patid = start_pos; patid != end_pos; patid++) {
+                    Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
+                    int state = pat->at(leafid);
+                    int freq = pat->frequency;
+                    if (state < 20) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            dad_branch->partial_pars[(site/UINT_BITS)*nstates+state] |= (1 << (site % UINT_BITS));
+                        }
+                    } else if (state == (*alnit)->STATE_UNKNOWN) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            for (int i = 0; i < 20; i++)
+                                    p[i] |= bit1;
+                        }
+                    } else {
+                        assert(state < 23);
+                        state = (state-20)*2;
+                        for (int j = 0; j < freq; j++, site++) {
+                            UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            p[ambi_aa[state]] |= bit1;
+                            p[ambi_aa[state+1]] |= bit1;
+                        }
                     }
                 }
-            }
-            assert(site == aln->num_informative_sites);
-            // add dummy states
-            if (site < max_sites)
-            	dad_branch->partial_pars[(site/UINT_BITS)*20] |= ~((1<<(site%UINT_BITS)) - 1);
-//            for (; site < max_sites; site++) {
-//                dad_branch->partial_pars[(site/UINT_BITS)*20] |= (1 << (site%UINT_BITS));
-//            }
-    		break;
-    	default:
-//            for (ptn = 0, site = 0; ptn < nptn; ptn++) {
-            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
-//                if (!aln->at(ptn).is_informative)
-//                    continue;
-            	int state = pat->at(leafid);
-                int freq = pat->frequency;
-                if (state < nstates) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        dad_branch->partial_pars[(site/UINT_BITS)*nstates+state] |= (1 << (site % UINT_BITS));
-                    }
-                } else if (state == aln->STATE_UNKNOWN) {
-                    for (int j = 0; j < freq; j++, site++) {
-                        UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
-                        UINT bit1 = (1 << (site%UINT_BITS));
-                        for (int i = 0; i < nstates; i++)
-                                p[i] |= bit1;
+                //assert(site == aln->num_informative_sites);
+                // add dummy states
+                //if (site < max_sites)
+                //    dad_branch->partial_pars[(site/UINT_BITS)*20] |= ~((1<<(site%UINT_BITS)) - 1);
+                break;
+            default:
+                for (int patid = start_pos; patid != end_pos; patid++) {
+                    Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
+                    int state = pat->at(leafid);
+                    int freq = pat->frequency;
+                    if (state < (*alnit)->num_states) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            dad_branch->partial_pars[(site/UINT_BITS)*nstates+state] |= (1 << (site % UINT_BITS));
+                        }
+                    } else if (state == (*alnit)->STATE_UNKNOWN) {
+                        for (int j = 0; j < freq; j++, site++) {
+                            UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
+                            UINT bit1 = (1 << (site%UINT_BITS));
+                            for (int i = 0; i < (*alnit)->num_states; i++)
+                                    p[i] |= bit1;
+                        }
+                    } else {
+                        assert(0);
                     }
-                } else {
-                	assert(0);
                 }
-            }
-            assert(site == aln->num_informative_sites);
-            // add dummy states
-            if (site < max_sites)
-            	dad_branch->partial_pars[(site/UINT_BITS)*nstates] |= ~((1<<(site%UINT_BITS)) - 1);
-//            for (; site < max_sites; site++) {
-//                dad_branch->partial_pars[(site/UINT_BITS)*nstates] |= (1 << (site%UINT_BITS));
-//            }
-    		break;
-    	}
+                break;
+            } // end of switch
+            
+            start_pos = end_pos;
+        } // FOR LOOP
 
+        assert(site == aln->num_informative_sites);
+        // add dummy states
+        if (site < max_sites)
+            dad_branch->partial_pars[(site/UINT_BITS)*nstates] |= ~((1<<(site%UINT_BITS)) - 1);
     } else {
         // internal node
         assert(node->degree() == 3); // it works only for strictly bifurcating tree
@@ -168,7 +178,7 @@ void PhyloTree::computePartialParsimonyFast(PhyloNeighbor *dad_branch, PhyloNode
             #endif
 			for (site = 0; site<nsites; site++) {
 				UINT w;
-                size_t offset = 4*site;
+                size_t offset = nstates*site;
                 UINT *x = left->partial_pars + offset;
                 UINT *y = right->partial_pars + offset;
                 UINT *z = dad_branch->partial_pars + offset;
@@ -178,7 +188,7 @@ void PhyloTree::computePartialParsimonyFast(PhyloNeighbor *dad_branch, PhyloNode
 				z[3] = x[3] & y[3];
 				w = z[0] | z[1] | z[2] | z[3];
 				w = ~w;
-				score += vml_popcnt(w);
+				score += __builtin_popcount(w);
 				z[0] |= w & (x[0] | y[0]);
 				z[1] |= w & (x[1] | y[1]);
 				z[2] |= w & (x[2] | y[2]);
@@ -212,6 +222,9 @@ void PhyloTree::computePartialParsimonyFast(PhyloNeighbor *dad_branch, PhyloNode
         dad_branch->partial_pars[nstates*nsites] = score + left->partial_pars[nstates*nsites] + right->partial_pars[nstates*nsites];
 //        dad_branch->partial_pars[0] = score;
     }
+    
+    if (!aln->isSuperAlignment())
+        delete partitions;
 }
 
 
@@ -227,7 +240,7 @@ int PhyloTree::computeParsimonyBranchFast(PhyloNeighbor *dad_branch, PhyloNode *
         computePartialParsimonyFast(node_branch, node);
     int site;
     int nsites = (aln->num_informative_sites + UINT_BITS-1) / UINT_BITS;
-    int nstates = aln->num_states;
+    int nstates = aln->getMaxNumStates();
 
     int scoreid = ((aln->num_informative_sites+UINT_BITS-1)/UINT_BITS)*nstates;
     UINT sum_end_node = (dad_branch->partial_pars[scoreid] + node_branch->partial_pars[scoreid]);
@@ -247,10 +260,10 @@ int PhyloTree::computeParsimonyBranchFast(PhyloNeighbor *dad_branch, PhyloNode *
 			UINT w = (x[0] & y[0]) | (x[1] & y[1]) | (x[2] & y[2]) | (x[3] & y[3]);
 			w = ~w;
 			score += vml_popcnt(w);
-            #ifndef _OPENMP
-            if (score >= lower_bound)
-                break;
-            #endif
+//            #ifndef _OPENMP
+//            if (score >= lower_bound)
+//                break;
+//            #endif
 		}
 		break;
     default:
@@ -268,10 +281,10 @@ int PhyloTree::computeParsimonyBranchFast(PhyloNeighbor *dad_branch, PhyloNode *
 			}
 			w = ~w;
 			score += vml_popcnt(w);
-            #ifndef _OPENMP
-            if (score >= lower_bound)
-                break;
-            #endif
+//            #ifndef _OPENMP
+//            if (score >= lower_bound)
+//                break;
+//            #endif
 		}
 		break;
     }
@@ -312,40 +325,78 @@ int PhyloTree::computeParsimonyTree(const char *out_prefix, Alignment *alignment
     if (size < 3)
         outError(ERR_FEW_TAXA);
 
-    freeNode();
+    IntVector taxon_order;
+    taxon_order.reserve(size);
+
+    if (constraintTree.empty()) {
+        freeNode();
+        taxon_order.resize(size);
+        for (int i = 0; i < size; i++)
+            taxon_order[i] = i;
+        // randomize the addition order
+        my_random_shuffle(taxon_order.begin(), taxon_order.end());
+
+        root = newNode(size);
+
+        // create initial tree with 3 taxa
+        for (leafNum = 0; leafNum < 3; leafNum++) {
+            if (verbose_mode >= VB_MAX)
+                cout << "Add " << aln->getSeqName(taxon_order[leafNum]) << " to the tree" << endl;
+            Node *new_taxon = newNode(taxon_order[leafNum], aln->getSeqName(taxon_order[leafNum]).c_str());
+            root->addNeighbor(new_taxon, -1.0);
+            new_taxon->addNeighbor(root, -1.0);
+        }
+    } else {
+        // first copy the constraint tree
+        MTree::copyTree(&constraintTree);
+        
+        // convert to bifurcating tree if needed
+        extractBifurcatingSubTree();
+        assert(isBifurcating());
+        
+        // assign proper taxon IDs
+        NodeVector nodes;
+        NodeVector::iterator it;
+        getTaxa(nodes);
+        leafNum = nodes.size();
+        vector<int> pushed;
+        pushed.resize(size, 0);
+        for (it = nodes.begin(); it != nodes.end(); it++) {
+            (*it)->id = aln->getSeqID((*it)->name);
+            assert((*it)->id >= 0);
+            taxon_order.push_back((*it)->id);
+            pushed[(*it)->id] = 1;
+        }
 
-    root = newNode(size);
+        // start with constraint tree
+        int i;
+        for (i = 0; i < size; i++)
+            if (!pushed[i] && constraintTree.hasTaxon(aln->getSeqName(i))) {
+                taxon_order.push_back(i);
+                pushed[i] = 1;
+            }
+        assert(taxon_order.size() == constraintTree.leafNum);
+        for (int i = 0; i < size; i++)
+            if (!pushed[i]) {
+                taxon_order.push_back(i);
+            }
+        // randomize the addition order
+        my_random_shuffle(taxon_order.begin()+leafNum, taxon_order.begin()+constraintTree.leafNum);
+        my_random_shuffle(taxon_order.begin()+constraintTree.leafNum, taxon_order.end());
 
-    IntVector taxon_order;
-    taxon_order.resize(size);
-    for (int i = 0; i < size; i++)
-        taxon_order[i] = i;
-    // randomize the addition order
-    my_random_shuffle(taxon_order.begin(), taxon_order.end());
-
-    // create initial tree with 3 taxa
-    for (leafNum = 0; leafNum < 3; leafNum++) {
-        if (verbose_mode >= VB_MAX)
-            cout << "Add " << aln->getSeqName(taxon_order[leafNum]) << " to the tree" << endl;
-        Node *new_taxon = newNode(taxon_order[leafNum], aln->getSeqName(taxon_order[leafNum]).c_str());
-        root->addNeighbor(new_taxon, -1.0);
-        new_taxon->addNeighbor(root, -1.0);
     }
     root = findNodeID(taxon_order[0]);
     initializeAllPartialPars();
-    size_t index = 6;
+    size_t index = (2*leafNum-3)*2;
     size_t pars_block_size = getBitsBlockSize();
 
-    if (isSuperTree())
-        ((PhyloSuperTree*)this)->mapTrees();
-    
     UINT *tmp_partial_pars;
     tmp_partial_pars = newBitsBlock();
 
-    // stepwise adding the next taxon
-    for (leafNum = 3; leafNum < size; leafNum++) {
+    // stepwise adding the next taxon for the remaining taxa
+    for (; leafNum < size; leafNum++) {
         if (verbose_mode >= VB_MAX)
-            cout << "Add " << aln->getSeqName(taxon_order[leafNum]) << " to the tree";
+            cout << "Adding " << aln->getSeqName(taxon_order[leafNum]) << " to the tree..." << endl;
         NodeVector nodes1, nodes2;
         getBranches(nodes1, nodes2);
         PhyloNode *target_node = NULL;
@@ -367,6 +418,7 @@ int PhyloTree::computeParsimonyTree(const char *out_prefix, Alignment *alignment
         added_node->addNeighbor((Node*) 2, -1.0);
 
         for (int nodeid = 0; nodeid < nodes1.size(); nodeid++) {
+        
             int score = addTaxonMPFast(new_taxon, added_node, nodes1[nodeid], nodes2[nodeid]);
             if (score < best_pars_score) {
                 best_pars_score = score;
@@ -422,6 +474,9 @@ int PhyloTree::computeParsimonyTree(const char *out_prefix, Alignment *alignment
 		file_name += ".parstree";
 		printTree(file_name.c_str(), WT_NEWLINE);
     }
+//    if (isSuperTree())
+//        ((PhyloSuperTree*)this)->mapTrees();
+    
     return best_pars_score;
 }
 
@@ -445,6 +500,11 @@ int PhyloTree::addTaxonMPFast(Node *added_taxon, Node* added_node, Node* node, N
     // compute the likelihood
     ((PhyloNeighbor*) added_taxon->findNeighbor(added_node))->clearPartialLh();
     int score = computeParsimonyBranch((PhyloNeighbor*) added_node->neighbors[0], (PhyloNode*) added_node);
+    if (leafNum < constraintTree.leafNum) {
+        // still during addition of taxa from constraint tree
+        if (!constraintTree.isCompatible(this))
+            score = INT_MAX;
+    }
     // remove the added node
     node->updateNeighbor(added_node, dad, len);
     dad->updateNeighbor(added_node, node, len);
diff --git a/phylotreesse.cpp b/phylotreesse.cpp
index 7782612..b0f9c01 100644
--- a/phylotreesse.cpp
+++ b/phylotreesse.cpp
@@ -18,14 +18,23 @@
  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
  ***************************************************************************/
 #include "phylotree.h"
-#include "phylokernel.h"
-#include "phylokernelmixture.h"
-#include "phylokernelmixrate.h"
-#include "phylokernelsitemodel.h"
+#include "vectorclass/instrset.h"
+
+#if INSTRSET < 2
+#include "phylokernelnew.h"
+#define KERNEL_FIX_STATES
+#include "phylokernelnew.h"
+#include "vectorf64.h"
+#endif
+
+//#include "phylokernel.h"
+//#include "phylokernelmixture.h"
+//#include "phylokernelmixrate.h"
+//#include "phylokernelsitemodel.h"
+
 #include "model/modelgtr.h"
 #include "model/modelset.h"
 
-
 /* BQM: to ignore all-gapp subtree at an alignment site */
 //#define IGNORE_GAP_LH
 
@@ -33,297 +42,118 @@
 
 void PhyloTree::setParsimonyKernel(LikelihoodKernel lk) {
     // set parsimony kernel
-    switch (lk) {
-//    case LK_SSE:
-//        computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchNaive;
-//        computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyNaive;
-//    	break;
-    case LK_EIGEN:
+    if (lk == LK_EIGEN || instruction_set < 2) { // LK_EIGEN requested, or instruction_set below 2: use the non-SIMD "Fast" routines
         computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchFast;
         computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyFast;
-    	break;
-    case LK_EIGEN_SSE:
-		if (instruction_set >= 7)
-			setParsimonyKernelAVX();
-		else {
-			computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchFastSIMD<Vec4ui>;
-            computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyFastSIMD<Vec4ui>;
-        }
-    	break;
-//    default:
-//        computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchNaive;
-//        computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyNaive;
-//    	break;
+    	return;
+    }
+    if (instruction_set >= 7) { // instruction_set 7 and above: delegate to the AVX parsimony kernel setup
+        setParsimonyKernelAVX();
+        return;
     }
+    if (instruction_set >= 2) { // instruction_set 2..6: delegate to the SSE parsimony kernel setup
+        setParsimonyKernelSSE();
+        return;
+    }
+    assert(0); // not expected to be reached: the branches above cover every instruction_set value
 }
 
-void PhyloTree::setLikelihoodKernel(LikelihoodKernel lk) {
+void PhyloTree::setLikelihoodKernel(LikelihoodKernel lk, int num_threads) {
+
+	sse = lk;
+    vector_size = 1;
+    this->num_threads = num_threads;
+
+    //--- parsimony kernel ---
     setParsimonyKernel(lk);
 
-	if (instruction_set >= 7) {
+    bool has_fma = (hasFMA3()) && (instruction_set >= 7) && (Params::getInstance().lk_no_avx != 2);
+    //--- dot-product kernel ---
+    if (has_fma) {
+		setDotProductFMA();
+	} else if (instruction_set >= 7) {
 		setDotProductAVX();
+    } else if (instruction_set >= 2) {
+        setDotProductSSE();
 	} else {
+
+#if INSTRSET < 2
 #ifdef BOOT_VAL_FLOAT
-		dotProduct = &PhyloTree::dotProductSIMD<float, Vec4f, 4>;
+        // TODO naive dot-product for float
+        assert(0 && "Not supported, contact developer");
+//		dotProduct = &PhyloTree::dotProductSIMD<float, Vec1f>;
 #else
-		dotProduct = &PhyloTree::dotProductSIMD<double, Vec2d, 2>;
+		dotProduct = &PhyloTree::dotProductSIMD<double, Vec1d>;
+#endif
+        dotProductDouble = &PhyloTree::dotProductSIMD<double, Vec1d>;
 #endif
-		dotProductDouble = &PhyloTree::dotProductSIMD<double, Vec2d, 2>;
 	}
-	sse = lk;
+
+    //--- naive likelihood kernel, no alignment specified yet ---
     if (!aln) {
-        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigen;
-        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigen;
-        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigen;
+#if INSTRSET < 2
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD<Vec1d, SAFE_LH>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD<Vec1d, SAFE_LH>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD<Vec1d, SAFE_LH>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec1d, SAFE_LH>;
+        sse = LK_EIGEN;
+#else
+        computeLikelihoodBranchPointer = NULL;
+        computeLikelihoodDervPointer = NULL;
+        computePartialLikelihoodPointer = NULL;
         computeLikelihoodFromBufferPointer = NULL;
         sse = LK_EIGEN;
+#endif
         return;
     }
-    
-    if (model_factory && model_factory->model->isSiteSpecificModel()) {
-        if (sse == LK_EIGEN) {
-            computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigen;
-            computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigen;
-            computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigen;
-            computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigen;
-            return;        
-        }
-        // LK_EIGEN_SSE
-        if (instruction_set >= 7) {
+
+    //--- SIMD kernel ---
+    if (sse == LK_EIGEN_SSE && instruction_set >= 2) {
+#ifdef INCLUDE_AVX512
+    	if (instruction_set >= 9) {
+    		setLikelihoodKernelAVX512();
+    		return;
+    	}
+#endif
+    	if (has_fma) {
+            // CPU supports AVX and FMA
+            setLikelihoodKernelFMA();
+        } else if (instruction_set >= 7) {
             // CPU supports AVX
             setLikelihoodKernelAVX();
-            return;
-        }
-        switch (aln->num_states) {
-        case 4:
-            computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigenSIMD<Vec2d, 2, 4>;
-            computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigenSIMD<Vec2d, 2, 4>;
-            computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigenSIMD<Vec2d, 2, 4>;
-            computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigenSIMD<Vec2d, 2, 4>;
-            break;
-        case 20:
-            computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigenSIMD<Vec2d, 2, 20>;
-            computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigenSIMD<Vec2d, 2, 20>;
-            computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigenSIMD<Vec2d, 2, 20>;
-            computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigenSIMD<Vec2d, 2, 20>;
-            break;
-        default:
-            computeLikelihoodBranchPointer = &PhyloTree::computeSitemodelLikelihoodBranchEigen;
-            computeLikelihoodDervPointer = &PhyloTree::computeSitemodelLikelihoodDervEigen;
-            computePartialLikelihoodPointer = &PhyloTree::computeSitemodelPartialLikelihoodEigen;
-            computeLikelihoodFromBufferPointer = &PhyloTree::computeSitemodelLikelihoodFromBufferEigen;
-            break;
-        }
-        return;
-    }
-    
-    if (sse == LK_EIGEN) {
-        if (model_factory && model_factory->model->isMixture()) {
-            if (model_factory->fused_mix_rate) {
-                computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigen;
-                computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigen;
-                computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigen;
-                computeLikelihoodFromBufferPointer = NULL;
-            } else {
-                computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigen;
-                computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigen;
-                computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigen;
-                computeLikelihoodFromBufferPointer = NULL;
-            }
         } else {
-            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigen;
-            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigen;
-            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigen;
-            computeLikelihoodFromBufferPointer = NULL;
+            // SSE kernel
+            setLikelihoodKernelSSE();
         }
         return;
     }
 
-//    cout << "Likelihood kernel: ";
-        
-    // set likelihood kernel
-	switch(aln->num_states) {
-	case 4:
-		switch(sse) {
-//		case LK_SSE:
-//			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<4>;
-//			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<4>;
-//			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<4>;
-//	        computeLikelihoodFromBufferPointer = NULL;
-//			break;
-		case LK_EIGEN_SSE:
-			if (instruction_set >= 7) {
-				// CPU supports AVX
-				setLikelihoodKernelAVX();
-			} else {
-				// CPU does not support AVX
-				if (model_factory && model_factory->model->isMixture()) {
-					if (model_factory->fused_mix_rate) {
-						computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigenSIMD<Vec2d, 2, 4>;
-						computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigenSIMD<Vec2d, 2, 4>;
-						computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigenSIMD<Vec2d, 2, 4>;
-						computeLikelihoodFromBufferPointer = &PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD<Vec2d, 2, 4>;
-					} else {
-						computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigenSIMD<Vec2d, 2, 4>;
-						computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigenSIMD<Vec2d, 2, 4>;
-						computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigenSIMD<Vec2d, 2, 4>;
-						computeLikelihoodFromBufferPointer = &PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD<Vec2d, 2, 4>;
-					}
-				} else {
-					computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec2d, 2, 4>;
-					computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec2d, 2, 4>;
-					computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec2d, 2, 4>;
-					computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec2d, 2, 4>;
-				}
-			}
-			break;
-		default:
-			break;
-		}
-		break;
-	case 20:
-		switch(sse) {
-//		case LK_SSE:
-//			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<20>;
-//			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<20>;
-//			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<20>;
-//	        computeLikelihoodFromBufferPointer = NULL;
-//			break;
-		case LK_EIGEN_SSE:
-			if (instruction_set >= 7) {
-				setLikelihoodKernelAVX();
-			} else {
-				if (model_factory && model_factory->model->isMixture()) {
-					if (model_factory->fused_mix_rate) {
-						computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigenSIMD<Vec2d, 2, 20>;
-						computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigenSIMD<Vec2d, 2, 20>;
-						computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigenSIMD<Vec2d, 2, 20>;
-						computeLikelihoodFromBufferPointer = &PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD<Vec2d, 2, 20>;
-					} else {
-						computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigenSIMD<Vec2d, 2, 20>;
-						computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigenSIMD<Vec2d, 2, 20>;
-						computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigenSIMD<Vec2d, 2, 20>;
-						computeLikelihoodFromBufferPointer = &PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD<Vec2d, 2, 20>;
-					}
-				} else {
-					computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec2d, 2, 20>;
-					computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec2d, 2, 20>;
-					computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec2d, 2, 20>;
-					computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec2d, 2, 20>;
-				}
-			}
-			break;
-		default:
-			break;
-		}
-		break;
-
-	case 64: // CODON
-		switch(sse) {
-//		case LK_SSE:
-//			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<64>;
-//			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<64>;
-//			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<64>;
-//			computeLikelihoodFromBufferPointer = NULL;
-//			break;
-		case LK_EIGEN_SSE:
-			if (instruction_set >= 7) {
-				setLikelihoodKernelAVX();
-			} else {
-				if (model_factory && model_factory->model->isMixture()) {
-					if (model_factory->fused_mix_rate) {
-						computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigenSIMD<Vec2d, 2, 64>;
-						computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigenSIMD<Vec2d, 2, 64>;
-						computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigenSIMD<Vec2d, 2, 64>;
-						computeLikelihoodFromBufferPointer = &PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD<Vec2d, 2, 64>;
-//						cout << "Fast-SSE-semi-mixture" << endl;
-					} else {
-						computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigenSIMD<Vec2d, 2, 64>;
-						computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigenSIMD<Vec2d, 2, 64>;
-						computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigenSIMD<Vec2d, 2, 64>;
-						computeLikelihoodFromBufferPointer = &PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD<Vec2d, 2, 64>;
-//						cout << "Fast-SSE-mixture" << endl;
-					}
-				} else {
-					computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec2d, 2, 64>;
-					computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec2d, 2, 64>;
-					computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec2d, 2, 64>;
-					computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec2d, 2, 64>;
-//					cout << "Fast-SSE" << endl;
-				}
-			}
-			break;
-		default:
-			break;
-		}
-		break;
-
-
-	case 2:
-		switch(sse) {
-//		case LK_SSE:
-//			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<2>;
-//			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<2>;
-//			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<2>;
-//	        computeLikelihoodFromBufferPointer = NULL;
-//			break;
-		case LK_EIGEN_SSE:
-			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec2d, 2, 2>;
-			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec2d, 2, 2>;
-			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec2d, 2, 2>;
-	        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec2d, 2, 2>;
-			break;
-		default:
-			break;
-		}
-		break;
+#if INSTRSET < 2
+    //--- naive kernel for site-specific model ---
+    if (model_factory && model_factory->model->isSiteSpecificModel()) {
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD<Vec1d, SAFE_LH, false, true>;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD<Vec1d, SAFE_LH, false, true>;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD<Vec1d, SAFE_LH, false, true>;
+        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec1d, SAFE_LH, false, true>;
+        return;
+    }
 
-	default:
-        if (sse == LK_EIGEN_SSE) {
-            if (model_factory && model_factory->model->isMixture()) {
-                if (model_factory->fused_mix_rate) {
-                    computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigen;
-                    computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigen;
-                    computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigen;
-                    computeLikelihoodFromBufferPointer = NULL;
-                } else {
-                    computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigen;
-                    computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigen;
-                    computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigen;
-                    computeLikelihoodFromBufferPointer = NULL;
-                }
-            } else {
-                computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigen;
-                computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigen;
-                computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigen;
-                computeLikelihoodFromBufferPointer = NULL;
-            }
-            sse = LK_EIGEN;
-//        } else {
-//            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchNaive;
-//            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervNaive;
-//            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodNaive;
-//            computeLikelihoodFromBufferPointer = NULL;
-//            sse = LK_NORMAL;
-        }
-		break;
-	}
+    //--- naive (no SIMD) kernel ---
+    computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchGenericSIMD<Vec1d, SAFE_LH>;
+    computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervGenericSIMD<Vec1d, SAFE_LH>;
+    computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodGenericSIMD<Vec1d, SAFE_LH>;
+    computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferGenericSIMD<Vec1d, SAFE_LH>;
+#else
+    computeLikelihoodBranchPointer = NULL;
+    computeLikelihoodDervPointer = NULL;
+    computePartialLikelihoodPointer = NULL;
+    computeLikelihoodFromBufferPointer = NULL;
+#endif
 }
 
 void PhyloTree::changeLikelihoodKernel(LikelihoodKernel lk) {
 	if (sse == lk) return;
-//	if ((sse == LK_EIGEN || sse == LK_EIGEN_SSE) && (lk == LK_NORMAL || lk == LK_SSE)) {
-//		// need to increase the memory usage when changing from new kernel to old kernel
-//        if (params->lh_mem_save == LM_PER_NODE)
-//            params->lh_mem_save = LM_ALL_BRANCH;
-//		setLikelihoodKernel(lk);
-//		deleteAllPartialLh();
-//		initializeAllPartialLh();
-//		clearAllPartialLH();
-//	} else {
-		// otherwise simply assign variable sse
-		setLikelihoodKernel(lk);
-//	}
+    setLikelihoodKernel(lk, num_threads);
 }
 
 /*******************************************************
@@ -332,8 +162,8 @@ void PhyloTree::changeLikelihoodKernel(LikelihoodKernel lk) {
  *
  ******************************************************/
 
-void PhyloTree::computePartialLikelihood(PhyloNeighbor *dad_branch, PhyloNode *dad) {
-	(this->*computePartialLikelihoodPointer)(dad_branch, dad);
+void PhyloTree::computePartialLikelihood(TraversalInfo &info, size_t ptn_left, size_t ptn_right, int thread_id) {
+	(this->*computePartialLikelihoodPointer)(info, ptn_left, ptn_right, thread_id);
 }
 
 double PhyloTree::computeLikelihoodBranch(PhyloNeighbor *dad_branch, PhyloNode *dad) {
@@ -349,13 +179,19 @@ void PhyloTree::computeLikelihoodDerv(PhyloNeighbor *dad_branch, PhyloNode *dad,
 double PhyloTree::computeLikelihoodFromBuffer() {
 	assert(current_it && current_it_back);
 
-	if (computeLikelihoodFromBufferPointer)
+	if (computeLikelihoodFromBufferPointer && optimize_by_newton)
 		return (this->*computeLikelihoodFromBufferPointer)();
 	else
 		return (this->*computeLikelihoodBranchPointer)(current_it, (PhyloNode*)current_it_back->node);
 
 }
 
+double PhyloTree::dotProductDoubleCall(double *x, double *y, int size) {
+    return (this->*dotProductDouble)(x, y, size);
+}
+
+
+
 void PhyloTree::computeTipPartialLikelihood() {
 	if (tip_partial_lh_computed)
 		return;
@@ -371,88 +207,103 @@ void PhyloTree::computeTipPartialLikelihood() {
 	computePtnInvar();
 
     if (getModel()->isSiteSpecificModel()) {
-        ModelSet *models = (ModelSet*)model;
-        size_t nptn = aln->getNPattern(), max_nptn = get_safe_upper_limit(nptn), tip_block_size = max_nptn * aln->num_states;
+//        ModelSet *models = (ModelSet*)model;
+        size_t nptn = aln->getNPattern(), max_nptn = ((nptn+vector_size-1)/vector_size)*vector_size, tip_block_size = max_nptn * aln->num_states;
         int nstates = aln->num_states;
         int nseq = aln->getNSeq();
+        assert(vector_size > 0);
 #ifdef _OPENMP
         #pragma omp parallel for schedule(static)
 #endif
         for (int nodeid = 0; nodeid < nseq; nodeid++) {
-            int i, x;
+            int i, x, v;
             double *partial_lh = tip_partial_lh + tip_block_size*nodeid;
             size_t ptn;
-            for (ptn = 0; ptn < nptn; ptn++, partial_lh += nstates) {
-                int state = aln->at(ptn)[nodeid];
-//                double *partial_lh = node_partial_lh + ptn*nstates;
-                double *inv_evec = models->at(ptn)->getInverseEigenvectors();
-
-                if (state < nstates) {
-                    for (i = 0; i < nstates; i++)
-                        partial_lh[i] = inv_evec[i*nstates+state];
-                } else if (state == aln->STATE_UNKNOWN) {
-                    // special treatment for unknown char
-                    for (i = 0; i < nstates; i++) {
-                        double lh_unknown = 0.0;
-                        double *this_inv_evec = inv_evec + i*nstates;
-                        for (x = 0; x < nstates; x++)
-                            lh_unknown += this_inv_evec[x];
-                        partial_lh[i] = lh_unknown;
-                    }
-                } else {
-                    double lh_ambiguous;
-                    // ambiguous characters
-                    int ambi_aa[] = {
-                        4+8, // B = N or D
-                        32+64, // Z = Q or E
-                        512+1024 // U = I or L
-                        };
-                    switch (aln->seq_type) {
-                    case SEQ_DNA:
-                        {
-                            int cstate = state-nstates+1;
-                            for (i = 0; i < nstates; i++) {
-                                lh_ambiguous = 0.0;
-                                for (x = 0; x < nstates; x++)
-                                    if ((cstate) & (1 << x))
-                                        lh_ambiguous += inv_evec[i*nstates+x];
-                                partial_lh[i] = lh_ambiguous;
-                            }
+            for (ptn = 0; ptn < nptn; ptn+=vector_size, partial_lh += nstates*vector_size) {
+//                int state[vector_size];
+//                for (v = 0; v < vector_size; v++) {
+//                    if (ptn+v < nptn)
+//                        state[v] = aln->at(ptn+v)[nodeid];
+//                    else
+//                        state[v] = aln->STATE_UNKNOWN;
+//                }
+
+                double *inv_evec = &model->getInverseEigenvectors()[ptn*nstates*nstates];
+                for (v = 0; v < vector_size; v++) {
+                    int state = aln->STATE_UNKNOWN;
+                    if (ptn+v < nptn)
+                        state = aln->at(ptn+v)[nodeid];
+    //                double *partial_lh = node_partial_lh + ptn*nstates;
+//                    double *inv_evec = models->at(ptn)->getInverseEigenvectors();
+
+                    if (state < nstates) {
+                        for (i = 0; i < nstates; i++)
+                            partial_lh[i*vector_size+v] = inv_evec[(i*nstates+state)*vector_size+v];
+                    } else if (state == aln->STATE_UNKNOWN) {
+                        // special treatment for unknown char
+                        for (i = 0; i < nstates; i++) {
+                            double lh_unknown = 0.0;
+//                            double *this_inv_evec = inv_evec + i*nstates;
+                            for (x = 0; x < nstates; x++)
+                                lh_unknown += inv_evec[(i*nstates+x)*vector_size+v];
+                            partial_lh[i*vector_size+v] = lh_unknown;
                         }
-                        break;
-                    case SEQ_PROTEIN:
-                        //map[(unsigned char)'B'] = 4+8+19; // N or D
-                        //map[(unsigned char)'Z'] = 32+64+19; // Q or E
-                        {
-                            int cstate = state-nstates;
-                            for (i = 0; i < nstates; i++) {
-                                lh_ambiguous = 0.0;
-                                for (x = 0; x < 11; x++)
-                                    if (ambi_aa[cstate] & (1 << x))
-                                        lh_ambiguous += inv_evec[i*nstates+x];
-                                partial_lh[i] = lh_ambiguous;
+                    } else {
+                        double lh_ambiguous;
+                        // ambiguous characters
+                        int ambi_aa[] = {
+                            4+8, // B = N or D
+                            32+64, // Z = Q or E
+                            512+1024 // U = I or L
+                            };
+                        switch (aln->seq_type) {
+                        case SEQ_DNA:
+                            {
+                                int cstate = state-nstates+1;
+                                for (i = 0; i < nstates; i++) {
+                                    lh_ambiguous = 0.0;
+                                    for (x = 0; x < nstates; x++)
+                                        if ((cstate) & (1 << x))
+                                            lh_ambiguous += inv_evec[(i*nstates+x)*vector_size+v];
+                                    partial_lh[i*vector_size+v] = lh_ambiguous;
+                                }
+                            }
+                            break;
+                        case SEQ_PROTEIN:
+                            //map[(unsigned char)'B'] = 4+8+19; // N or D
+                            //map[(unsigned char)'Z'] = 32+64+19; // Q or E
+                            {
+                                int cstate = state-nstates;
+                                for (i = 0; i < nstates; i++) {
+                                    lh_ambiguous = 0.0;
+                                    for (x = 0; x < 11; x++)
+                                        if (ambi_aa[cstate] & (1 << x))
+                                            lh_ambiguous += inv_evec[(i*nstates+x)*vector_size+v];
+                                    partial_lh[i*vector_size+v] = lh_ambiguous;
+                                }
                             }
+                            break;
+                        default:
+                            assert(0);
+                            break;
                         }
-                        break;
-                    default:
-                        assert(0);
-                        break;
                     }
-                }
-                // sanity check
-//                bool all_zero = true;
-//                for (i = 0; i < nstates; i++)
-//                    if (partial_lh[i] != 0) {
-//                        all_zero = false;
-//                        break;
-//                    }
-//                assert(!all_zero && "some tip_partial_lh are all zeros");
-                
-            }
+                    // sanity check
+    //                bool all_zero = true;
+    //                for (i = 0; i < nstates; i++)
+    //                    if (partial_lh[i] != 0) {
+    //                        all_zero = false;
+    //                        break;
+    //                    }
+    //                assert(!all_zero && "some tip_partial_lh are all zeros");
+                    
+                } // FOR v
+            } // FOR ptn
+            // NO Need to copy dummy anymore
             // dummy values
-            for (ptn = nptn; ptn < max_nptn; ptn++, partial_lh += nstates)
-                memcpy(partial_lh, partial_lh-nstates, nstates*sizeof(double));
-        }
+//            for (ptn = nptn; ptn < max_nptn; ptn++, partial_lh += nstates)
+//                memcpy(partial_lh, partial_lh-nstates, nstates*sizeof(double));
+        } // FOR nodeid
         return;
     }
     
@@ -532,7 +383,7 @@ void PhyloTree::computePtnFreq() {
 	if (ptn_freq_computed) return;
 	ptn_freq_computed = true;
 	size_t nptn = aln->getNPattern();
-	size_t maxptn = get_safe_upper_limit(nptn+model_factory->unobserved_ptns.size());
+	size_t maxptn = get_safe_upper_limit(nptn)+get_safe_upper_limit(model_factory->unobserved_ptns.size());
 	int ptn;
 	for (ptn = 0; ptn < nptn; ptn++)
 		ptn_freq[ptn] = (*aln)[ptn].frequency;
@@ -542,7 +393,7 @@ void PhyloTree::computePtnFreq() {
 
 void PhyloTree::computePtnInvar() {
 	size_t nptn = aln->getNPattern(), ptn;
-	size_t maxptn = get_safe_upper_limit(nptn+model_factory->unobserved_ptns.size());
+	size_t maxptn = get_safe_upper_limit(nptn)+get_safe_upper_limit(model_factory->unobserved_ptns.size());
 	int nstates = aln->num_states;
 
     double *state_freq = aligned_alloc<double>(nstates);
@@ -557,13 +408,13 @@ void PhyloTree::computePtnInvar() {
 				ptn_invar[ptn] = p_invar * state_freq[(int) (*aln)[ptn].const_char];
 			}
 		}
-		// ascertmain bias correction
-		for (ptn = 0; ptn < model_factory->unobserved_ptns.size(); ptn++)
-			ptn_invar[nptn+ptn] = p_invar * state_freq[(int)model_factory->unobserved_ptns[ptn]];
-
+//		// ascertmain bias correction
+//		for (ptn = 0; ptn < model_factory->unobserved_ptns.size(); ptn++)
+//			ptn_invar[nptn+ptn] = p_invar * state_freq[(int)model_factory->unobserved_ptns[ptn]];
+//
 		// dummy values
-		for (ptn = nptn+model_factory->unobserved_ptns.size(); ptn < maxptn; ptn++)
-			ptn_invar[ptn] = ptn_invar[ptn-1];
+		for (ptn = nptn; ptn < maxptn; ptn++)
+			ptn_invar[ptn] = p_invar;
 	}
 	aligned_free(state_freq);
 }
@@ -577,7 +428,7 @@ void PhyloTree::computePtnInvar() {
  *
  ******************************************************/
 
-//template <const int nstates>
+/*
 void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
 
     // don't recompute the likelihood
@@ -602,10 +453,19 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
     size_t ptn, c;
     size_t orig_ntn = aln->size();
     size_t ncat = site_rate->getNRate();
-    const size_t nstatesqr=nstates*nstates;
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
+    size_t mix_addr_nstates[ncat_mix], mix_addr[ncat_mix];
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+    for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom;
+        mix_addr_nstates[c] = m*nstates;
+        mix_addr[c] = mix_addr_nstates[c]*nstates;
+    }
     size_t i, x;
-    size_t block = nstates * ncat;
-
+    size_t block = nstates * ncat_mix;
+    size_t tip_block = nstates * model->getNMixtures();
+    size_t scale_size = nptn * ncat_mix;
+    
 	double *evec = model->getEigenvectors();
 	double *inv_evec = model->getInverseEigenvectors();
 	assert(inv_evec && evec);
@@ -615,12 +475,15 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
 
 	// internal node
 	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+    int num_leaves = 0;
 	FOR_NEIGHBOR_IT(node, dad, it) {
         PhyloNeighbor *nei = (PhyloNeighbor*)*it;
 		if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
         if ((nei->partial_lh_computed & 1) == 0)
             computePartialLikelihood(nei, node);
         dad_branch->lh_scale_factor += nei->lh_scale_factor;
+        if (nei->node->isLeaf())
+            num_leaves ++;
 	}
 
     if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
@@ -638,12 +501,17 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
                 break;
             }
         }
+        if (!done) {
+            printTree(cout, WT_BR_LEN + WT_NEWLINE);
+        }
         assert(done && "partial_lh is not re-oriented");
     }
 
     // precompute buffer to save times
     double *echildren = new double[block*nstates*(node->degree()-1)];
-    double *partial_lh_leaves = new double[(aln->STATE_UNKNOWN+1)*block*(node->degree()-1)];
+    double *partial_lh_leaves = NULL;
+    if (num_leaves > 0)
+        partial_lh_leaves = new double[(aln->STATE_UNKNOWN+1)*block*num_leaves];
     double *echild = echildren;
     double *partial_lh_leaf = partial_lh_leaves;
 
@@ -651,15 +519,20 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
         double expchild[nstates];
         PhyloNeighbor *child = (PhyloNeighbor*)*it;
         // precompute information buffer
-        for (c = 0; c < ncat; c++) {
-            double len_child = site_rate->getRate(c) * child->length;
+        double *echild_ptr = echild;
+        for (c = 0; c < ncat_mix; c++) {
+            double len_child = site_rate->getRate(c%ncat) * child->length;
+            double *eval_ptr = eval + mix_addr_nstates[c];
+            double *evec_ptr = evec + mix_addr[c];
             for (i = 0; i < nstates; i++) {
-                expchild[i] = exp(eval[i]*len_child);
+                expchild[i] = exp(eval_ptr[i]*len_child);
             }
-            for (x = 0; x < nstates; x++)
+            for (x = 0; x < nstates; x++) {
                 for (i = 0; i < nstates; i++) {
-                    echild[c*nstatesqr+x*nstates+i] = evec[x*nstates+i] * expchild[i];
+                    echild_ptr[i] = evec_ptr[x*nstates+i] * expchild[i];
                 }
+                echild_ptr += nstates;
+            }
         }
 
         // pre compute information for tip
@@ -667,16 +540,23 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
             vector<int>::iterator it;
             for (it = aln->seq_states[child->node->id].begin(); it != aln->seq_states[child->node->id].end(); it++) {
                 int state = (*it);
-                for (x = 0; x < block; x++) {
-                    double vchild = 0.0;
-                    for (i = 0; i < nstates; i++) {
-                        vchild += echild[x*nstates+i] * tip_partial_lh[state*nstates+i];
+                double *this_partial_lh_leaf = partial_lh_leaf + state*block;
+                double *echild_ptr = echild;
+                for (c = 0; c < ncat_mix; c++) {
+                    double *this_tip_partial_lh = tip_partial_lh + state*tip_block + mix_addr_nstates[c];
+                    for (x = 0; x < nstates; x++) {
+                        double vchild = 0.0;
+                        for (i = 0; i < nstates; i++) {
+                            vchild += echild_ptr[i] * this_tip_partial_lh[i];
+                        }
+                        this_partial_lh_leaf[x] = vchild;
+                        echild_ptr += nstates;
                     }
-                    partial_lh_leaf[state*block+x] = vchild;
+                    this_partial_lh_leaf += nstates;
                 }
             }
+            size_t addr = aln->STATE_UNKNOWN * block;
             for (x = 0; x < block; x++) {
-                size_t addr = aln->STATE_UNKNOWN * block;
                 partial_lh_leaf[addr+x] = 1.0;
             }
             partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
@@ -685,9 +565,6 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
     }
     
     
-    double sum_scale = 0.0;
-    
-        
     double *eleft = echildren, *eright = echildren + block*nstates;
     
 	if (!left->node->isLeaf() && right->node->isLeaf()) {
@@ -701,23 +578,25 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
     
     if (node->degree() > 3) {
 
-        /*--------------------- multifurcating node ------------------*/
+        //--------------------- multifurcating node ------------------//
     
         // now for-loop computing partial_lh over all site-patterns
 #ifdef _OPENMP
-#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
+#pragma omp parallel for private(ptn, c, x, i) schedule(static)
 #endif
         for (ptn = 0; ptn < nptn; ptn++) {
             double partial_lh_all[block];
             for (i = 0; i < block; i++)
                 partial_lh_all[i] = 1.0;
-            dad_branch->scale_num[ptn] = 0;
+            UBYTE *scale_dad = dad_branch->scale_num + ptn*ncat_mix;
+            memset(scale_dad, 0, sizeof(UBYTE)*ncat_mix);
                 
             double *partial_lh_leaf = partial_lh_leaves;
             double *echild = echildren;
 
             FOR_NEIGHBOR_IT(node, dad, it) {
                 PhyloNeighbor *child = (PhyloNeighbor*)*it;
+                UBYTE *scale_child = child->scale_num + ptn*ncat_mix;
                 if (child->node->isLeaf()) {
                     // external node
                     int state_child = (ptn < orig_ntn) ? (aln->at(ptn))[child->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
@@ -731,10 +610,10 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
                     // internal node
                     double *partial_lh = partial_lh_all;
                     double *partial_lh_child = child->partial_lh + ptn*block;
-                    dad_branch->scale_num[ptn] += child->scale_num[ptn];
 
                     double *echild_ptr = echild;
-                    for (c = 0; c < ncat; c++) {
+                    for (c = 0; c < ncat_mix; c++) {
+                        scale_dad[c] += scale_child[c];
                         // compute real partial likelihood vector
                         for (x = 0; x < nstates; x++) {
                             double vchild = 0.0;
@@ -754,11 +633,11 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
             
         
             // compute dot-product with inv_eigenvector
-            double lh_max = 0.0;
             double *partial_lh_tmp = partial_lh_all;
             double *partial_lh = dad_branch->partial_lh + ptn*block;
-            for (c = 0; c < ncat; c++) {
-                double *inv_evec_ptr = inv_evec;
+            for (c = 0; c < ncat_mix; c++) {
+                double lh_max = 0.0;
+                double *inv_evec_ptr = inv_evec + mix_addr[c];
                 for (i = 0; i < nstates; i++) {
                     double res = 0.0;
                     for (x = 0; x < nstates; x++) {
@@ -768,102 +647,85 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
                     partial_lh[i] = res;
                     lh_max = max(lh_max, fabs(res));
                 }
-                partial_lh += nstates;
-                partial_lh_tmp += nstates;
-            }
-            // check if one should scale partial likelihoods
-            if (lh_max < SCALING_THRESHOLD) {
-                partial_lh = dad_branch->partial_lh + ptn*block;
-                if (lh_max == 0.0) {
-                    // for very shitty data
-                    for (c = 0; c < ncat; c++)
-                        memcpy(&partial_lh[c*nstates], &tip_partial_lh[aln->STATE_UNKNOWN*nstates], nstates*sizeof(double));
-                    sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
-                    //sum_scale += log(lh_max) * ptn_freq[ptn];
-                    dad_branch->scale_num[ptn] += 4;
-                    int nsite = aln->getNSite();
-                    for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
-                        if (aln->getPatternID(i) == ptn) {
-                            outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
-                            x++;
-                        }
-                } else if (ptn_invar[ptn] == 0.0) {
-                    // now do the likelihood scaling
-                    for (i = 0; i < block; i++) {
-                        partial_lh[i] *= SCALING_THRESHOLD_INVER;
-                        //partial_lh[i] /= lh_max;
+                // check if one should scale partial likelihoods
+                if (lh_max < SCALING_THRESHOLD && lh_max != 0.0) {
+                    //assert(lh_max != 0.0 && "Numerical underflow for multifurcation node");
+                    if (ptn_invar[ptn] == 0.0) {
+                        // now do the likelihood scaling
+                        for (i = 0; i < nstates; i++)
+                            partial_lh[i] *= SCALING_THRESHOLD_INVER;
+                        scale_dad[c] += 1;
                     }
-                    // unobserved const pattern will never have underflow
-                    sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
-                    //sum_scale += log(lh_max) * ptn_freq[ptn];
-                    dad_branch->scale_num[ptn] += 1;
                 }
+                partial_lh += nstates;
+                partial_lh_tmp += nstates;
             }
 
         } // for ptn
-        dad_branch->lh_scale_factor += sum_scale;               
+//        dad_branch->lh_scale_factor += sum_scale;               
                 
         // end multifurcating treatment
     } else if (left->node->isLeaf() && right->node->isLeaf()) {
 
-        /*--------------------- TIP-TIP (cherry) case ------------------*/
+        //--------------------- TIP-TIP (cherry) case ------------------//
 
         double *partial_lh_left = partial_lh_leaves;
         double *partial_lh_right = partial_lh_leaves + (aln->STATE_UNKNOWN+1)*block;
 
 		// scale number must be ZERO
-	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+	    memset(dad_branch->scale_num, 0, scale_size * sizeof(UBYTE));
 #ifdef _OPENMP
 #pragma omp parallel for private(ptn, c, x, i) schedule(static)
 #endif
 		for (ptn = 0; ptn < nptn; ptn++) {
 			double partial_lh_tmp[nstates];
 			double *partial_lh = dad_branch->partial_lh + ptn*block;
-			int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
-			int state_right = (ptn < orig_ntn) ? (aln->at(ptn))[right->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
-			for (c = 0; c < ncat; c++) {
+			double *vleft = partial_lh_left + block*((ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn]);
+			double *vright = partial_lh_right + block*((ptn < orig_ntn) ? (aln->at(ptn))[right->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn]);
+			for (c = 0; c < ncat_mix; c++) {
+                double *inv_evec_ptr = inv_evec + mix_addr[c];
 				// compute real partial likelihood vector
-				double *left = partial_lh_left + (state_left*block+c*nstates);
-				double *right = partial_lh_right + (state_right*block+c*nstates);
 				for (x = 0; x < nstates; x++) {
-					partial_lh_tmp[x] = left[x] * right[x];
+					partial_lh_tmp[x] = vleft[x] * vright[x];
 				}
 
 				// compute dot-product with inv_eigenvector
-                double *inv_evec_ptr = inv_evec;
 				for (i = 0; i < nstates; i++) {
 					double res = 0.0;
 					for (x = 0; x < nstates; x++) {
 						res += partial_lh_tmp[x]*inv_evec_ptr[x];
 					}
                     inv_evec_ptr += nstates;
-					partial_lh[c*nstates+i] = res;
+					partial_lh[i] = res;
 				}
+                vleft += nstates;
+                vright += nstates;
+                partial_lh += nstates;
 			}
 		}
 	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
 
-        /*--------------------- TIP-INTERNAL NODE case ------------------*/
+        //--------------------- TIP-INTERNAL NODE case ------------------//
 
 		// only take scale_num from the right subtree
-		memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
+		memcpy(dad_branch->scale_num, right->scale_num, scale_size * sizeof(UBYTE));
 
 
         double *partial_lh_left = partial_lh_leaves;
 
 #ifdef _OPENMP
-#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
+#pragma omp parallel for private(ptn, c, x, i) schedule(static)
 #endif
 		for (ptn = 0; ptn < nptn; ptn++) {
 			double partial_lh_tmp[nstates];
 			double *partial_lh = dad_branch->partial_lh + ptn*block;
 			double *partial_lh_right = right->partial_lh + ptn*block;
-			int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
-            double *vleft = partial_lh_left + state_left*block;
-            double lh_max = 0.0;
-            
+			double *vleft = partial_lh_left + block*((ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn]);
+
             double *eright_ptr = eright;
-			for (c = 0; c < ncat; c++) {
+			for (c = 0; c < ncat_mix; c++) {
+                double lh_max = 0.0;
+                double *inv_evec_ptr = inv_evec + mix_addr[c];
 				// compute real partial likelihood vector
 				for (x = 0; x < nstates; x++) {
 					double vright = 0.0;
@@ -875,73 +737,59 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
                     eright_ptr += nstates;
 					partial_lh_tmp[x] = vleft[x] * (vright);
 				}
-                vleft += nstates;
-                partial_lh_right += nstates;
                 
 				// compute dot-product with inv_eigenvector
-                double *inv_evec_ptr = inv_evec;
 				for (i = 0; i < nstates; i++) {
 					double res = 0.0;
 					for (x = 0; x < nstates; x++) {
 						res += partial_lh_tmp[x]*inv_evec_ptr[x];
 					}
                     inv_evec_ptr += nstates;
-					partial_lh[c*nstates+i] = res;
+					partial_lh[i] = res;
                     lh_max = max(fabs(res), lh_max);
 				}
+                // check if one should scale partial likelihoods
+                if (lh_max < SCALING_THRESHOLD && lh_max != 0.0) {
+                    //assert(lh_max != 0.0 && "Numerical underflow for tip-inner node");
+                    if (ptn_invar[ptn] == 0.0) {
+                        // now do the likelihood scaling
+                        for (i = 0; i < nstates; i++)
+                            partial_lh[i] *= SCALING_THRESHOLD_INVER;
+                        dad_branch->scale_num[ptn*ncat_mix+c] += 1;
+                    }
+                }
+                vleft += nstates;
+                partial_lh_right += nstates;
+                partial_lh += nstates;
 			}
-            // check if one should scale partial likelihoods
-            if (lh_max < SCALING_THRESHOLD) {
-            	if (lh_max == 0.0) {
-            		// for very shitty data
-            		for (c = 0; c < ncat; c++)
-            			memcpy(&partial_lh[c*nstates], &tip_partial_lh[aln->STATE_UNKNOWN*nstates], nstates*sizeof(double));
-					sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
-					//sum_scale += log(lh_max) * ptn_freq[ptn];
-					dad_branch->scale_num[ptn] += 4;
-					int nsite = aln->getNSite();
-					for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
-						if (aln->getPatternID(i) == ptn) {
-							outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
-							x++;
-						}
-            	} else if (ptn_invar[ptn] == 0.0) {
-					// now do the likelihood scaling
-					for (i = 0; i < block; i++) {
-						partial_lh[i] *= SCALING_THRESHOLD_INVER;
-	                    //partial_lh[i] /= lh_max;
-					}
-					// unobserved const pattern will never have underflow
-					sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
-					//sum_scale += log(lh_max) * ptn_freq[ptn];
-					dad_branch->scale_num[ptn] += 1;
-            	}
-            }
-
 
 		}
-		dad_branch->lh_scale_factor += sum_scale;
+//		dad_branch->lh_scale_factor += sum_scale;
 //		delete [] partial_lh_left;
 
 	} else {
 
-        /*--------------------- INTERNAL-INTERNAL NODE case ------------------*/
+        //--------------------- INTERNAL-INTERNAL NODE case ------------------//
 
 #ifdef _OPENMP
-#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
+#pragma omp parallel for private(ptn, c, x, i) schedule(static)
 #endif
 		for (ptn = 0; ptn < nptn; ptn++) {
 			double partial_lh_tmp[nstates];
 			double *partial_lh = dad_branch->partial_lh + ptn*block;
 			double *partial_lh_left = left->partial_lh + ptn*block;
 			double *partial_lh_right = right->partial_lh + ptn*block;
-            double lh_max = 0.0;
-			dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
+            UBYTE *scale_dad = dad_branch->scale_num + ptn*ncat_mix;
+            UBYTE *scale_left = left->scale_num + ptn*ncat_mix;
+            UBYTE *scale_right = right->scale_num + ptn*ncat_mix; 
 
             double *eleft_ptr = eleft;
             double *eright_ptr = eright;
 
-			for (c = 0; c < ncat; c++) {
+			for (c = 0; c < ncat_mix; c++) {
+                scale_dad[c] = scale_left[c] + scale_right[c];
+                double lh_max = 0.0;
+                double *inv_evec_ptr = inv_evec + mix_addr[c];
 				// compute real partial likelihood vector
 				for (x = 0; x < nstates; x++) {
 					double vleft = 0.0, vright = 0.0;
@@ -955,61 +803,43 @@ void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNo
 					partial_lh_tmp[x] = vleft*vright;
 //                    assert(partial_lh_tmp[x] != 0.0);
 				}
-                partial_lh_left += nstates;
-                partial_lh_right += nstates;
                 
 				// compute dot-product with inv_eigenvector
-                double *inv_evec_ptr = inv_evec;
 				for (i = 0; i < nstates; i++) {
 					double res = 0.0;
 					for (x = 0; x < nstates; x++) {
 						res += partial_lh_tmp[x]*inv_evec_ptr[x];
 					}
                     inv_evec_ptr += nstates;
-					partial_lh[c*nstates+i] = res;
+					partial_lh[i] = res;
                     lh_max = max(lh_max, fabs(res));
 				}
+                // check if one should scale partial likelihoods
+                if (lh_max < SCALING_THRESHOLD && lh_max != 0.0) {
+                    //assert(lh_max != 0.0 && "Numerical underflow for inner-inner node");
+                    if (ptn_invar[ptn] == 0.0) {
+                        // BQM 2016-05-03: only scale for non-constant sites
+                        // now do the likelihood scaling
+                        for (i = 0; i < nstates; i++)
+                            partial_lh[i] *= SCALING_THRESHOLD_INVER;
+                        scale_dad[c] += 1;
+                    }
+                }
+                partial_lh_left += nstates;
+                partial_lh_right += nstates;
+                partial_lh += nstates;
 			}
 
-            // check if one should scale partial likelihoods
-            if (lh_max < SCALING_THRESHOLD) {
-            	if (lh_max == 0.0) {
-            		// for very shitty data
-            		for (c = 0; c < ncat; c++)
-            			memcpy(&partial_lh[c*nstates], &tip_partial_lh[aln->STATE_UNKNOWN*nstates], nstates*sizeof(double));
-					sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
-					//sum_scale += log(lh_max) * ptn_freq[ptn];
-					dad_branch->scale_num[ptn] += 4;
-					int nsite = aln->getNSite();
-					for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
-						if (aln->getPatternID(i) == ptn) {
-							outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
-							x++;
-						}
-            	} else if (ptn_invar[ptn] == 0.0) {
-                    // BQM 2016-05-03: only scale for non-constant sites
-					// now do the likelihood scaling
-					for (i = 0; i < block; i++) {
-						partial_lh[i] *= SCALING_THRESHOLD_INVER;
-	                    //partial_lh[i] /= lh_max;
-					}
-					// unobserved const pattern will never have underflow
-					sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
-					//sum_scale += log(lh_max) * ptn_freq[ptn];
-					dad_branch->scale_num[ptn] += 1;
-            	}
-            }
-
 		}
-		dad_branch->lh_scale_factor += sum_scale;
+//		dad_branch->lh_scale_factor += sum_scale;
 
 	}
 
-    delete [] partial_lh_leaves;
+    if (partial_lh_leaves)
+        delete [] partial_lh_leaves;
     delete [] echildren;
 }
 
-//template <const int nstates>
 void PhyloTree::computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
     PhyloNode *node = (PhyloNode*) dad_branch->node;
     PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
@@ -1030,12 +860,23 @@ void PhyloTree::computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode
         
     size_t nstates = aln->num_states;
     size_t ncat = site_rate->getNRate();
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
 
-    size_t block = ncat * nstates;
+    size_t block = ncat_mix * nstates;
+    size_t tip_block = nstates * model->getNMixtures();
     size_t ptn; // for big data size > 4GB memory required
     size_t c, i;
     size_t orig_nptn = aln->size();
     size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+
+    size_t mix_addr_nstates[ncat_mix], mix_addr[ncat_mix];
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+    for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom;
+        mix_addr_nstates[c] = m*nstates;
+        mix_addr[c] = mix_addr_nstates[c]*nstates;
+    }
+
     double *eval = model->getEigenvalues();
     assert(eval);
 
@@ -1050,11 +891,24 @@ void PhyloTree::computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode
 #endif
 	    	for (ptn = 0; ptn < nptn; ptn++) {
 				double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+                UBYTE *scale_dad = dad_branch->scale_num+ptn*ncat_mix;
 				double *theta = theta_all + ptn*block;
-				double *lh_tip = tip_partial_lh + ((int)((ptn < orig_nptn) ? (aln->at(ptn))[dad->id] :  model_factory->unobserved_ptns[ptn-orig_nptn]))*nstates;
-                for (c = 0; c < ncat; c++) {
-                    for (i = 0; i < nstates; i++) {
-                        theta[i] = lh_tip[i] * partial_lh_dad[i];
+                double *this_tip_partial_lh = tip_partial_lh + tip_block*((ptn < orig_nptn) ? (aln->at(ptn))[dad->id] :  model_factory->unobserved_ptns[ptn-orig_nptn]);
+                UBYTE min_scale = scale_dad[0];
+                for (c = 1; c < ncat_mix; c++)
+                    min_scale = min(min_scale, scale_dad[c]);
+                for (c = 0; c < ncat_mix; c++) {
+                    double *lh_tip = this_tip_partial_lh + mix_addr_nstates[c];
+                    if (scale_dad[c] == min_scale) {
+                        for (i = 0; i < nstates; i++) {
+                            theta[i] = lh_tip[i] * partial_lh_dad[i];
+                        }
+                    } else if (scale_dad[c] == min_scale+1) {
+                        for (i = 0; i < nstates; i++) {
+                            theta[i] = lh_tip[i] * partial_lh_dad[i] * SCALING_THRESHOLD;
+                        }
+                    } else {
+                        memset(theta, 0, sizeof(double)*nstates);
                     }
                     partial_lh_dad += nstates;
                     theta += nstates;
@@ -1067,15 +921,38 @@ void PhyloTree::computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode
 
 //	    	size_t all_entries = nptn*block;
 #ifdef _OPENMP
-#pragma omp parallel for private(ptn, i) schedule(static)
+#pragma omp parallel for private(ptn, i, c) schedule(static)
 #endif
 	    	for (ptn = 0; ptn < nptn; ptn++) {
 				double *theta = theta_all + ptn*block;
 			    double *partial_lh_node = node_branch->partial_lh + ptn*block;
 			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
-	    		for (i = 0; i < block; i++) {
-	    			theta[i] = partial_lh_node[i] * partial_lh_dad[i];
-	    		}
+
+                size_t ptn_ncat = ptn*ncat_mix; 
+                UBYTE *scale_dad = dad_branch->scale_num + ptn_ncat;
+                UBYTE *scale_node = node_branch->scale_num + ptn_ncat;
+                UBYTE sum_scale[ncat_mix];
+                UBYTE min_scale = sum_scale[0] = scale_dad[0] + scale_node[0];
+                for (c = 1; c < ncat_mix; c++) {
+                    sum_scale[c] = scale_dad[c] + scale_node[c];
+                    min_scale = min(min_scale, sum_scale[c]);
+                }
+                for (c = 0; c < ncat_mix; c++) {
+                    if (sum_scale[c] == min_scale) {
+                        for (i = 0; i < nstates; i++) {
+                            theta[i] = partial_lh_node[i] * partial_lh_dad[i];
+                        }
+                    } else if (sum_scale[c] == min_scale+1) {
+                        for (i = 0; i < nstates; i++) {
+                            theta[i] = partial_lh_node[i] * partial_lh_dad[i] * SCALING_THRESHOLD;
+                        }
+                    } else {
+                        memset(theta, 0, sizeof(double)*nstates);
+                    }
+                    theta += nstates;
+                    partial_lh_dad += nstates;
+                    partial_lh_node += nstates;
+                }
 			}
 	    }
 		theta_computed = true;
@@ -1084,15 +961,19 @@ void PhyloTree::computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode
     double *val0 = new double[block];
     double *val1 = new double[block];
     double *val2 = new double[block];
-	for (c = 0; c < ncat; c++) {
-		double prop = site_rate->getProp(c);
+	for (c = 0; c < ncat_mix; c++) {
+        size_t m = c/denom;
+        double *eval_ptr = eval + mix_addr_nstates[c];
+        size_t mycat = c%ncat;
+        double prop = site_rate->getProp(mycat) * model->getMixtureWeight(m);
+        size_t addr = c*nstates;
 		for (i = 0; i < nstates; i++) {
-			double cof = eval[i]*site_rate->getRate(c);
+			double cof = eval_ptr[i]*site_rate->getRate(mycat);
 			double val = exp(cof*dad_branch->length) * prop;
 			double val1_ = cof*val;
-			val0[c*nstates+i] = val;
-			val1[c*nstates+i] = val1_;
-			val2[c*nstates+i] = cof*val1_;
+			val0[addr+i] = val;
+			val1[addr+i] = val1_;
+			val2[addr+i] = cof*val1_;
 		}
 	}
 
@@ -1132,6 +1013,9 @@ void PhyloTree::computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode
     }
 	df = my_df;
 	ddf = my_ddf;
+    
+    assert(!isnan(df) && !isinf(df) && "Numerical underflow for lh-derivative");
+
     if (isnan(df) || isinf(df)) {
         df = 0.0;
         ddf = 0.0;
@@ -1177,25 +1061,37 @@ double PhyloTree::computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloN
     double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
     size_t nstates = aln->num_states;
     size_t ncat = site_rate->getNRate();
+    size_t ncat_mix = (model_factory->fused_mix_rate) ? ncat : ncat*model->getNMixtures();
 
-    size_t block = ncat * nstates;
+    size_t block = ncat_mix * nstates;
+    size_t tip_block = nstates * model->getNMixtures();
     size_t ptn; // for big data size > 4GB memory required
     size_t c, i;
     size_t orig_nptn = aln->size();
     size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+
+    size_t mix_addr_nstates[ncat_mix], mix_addr[ncat_mix];
+    size_t denom = (model_factory->fused_mix_rate) ? 1 : ncat;
+
     double *eval = model->getEigenvalues();
     assert(eval);
 
     double *val = new double[block];
-	for (c = 0; c < ncat; c++) {
-		double len = site_rate->getRate(c)*dad_branch->length;
-		double prop = site_rate->getProp(c);
+	for (c = 0; c < ncat_mix; c++) {
+        size_t mycat = c%ncat;
+        size_t m = c/denom;
+        mix_addr_nstates[c] = m*nstates;
+        mix_addr[c] = mix_addr_nstates[c]*nstates;
+        double *eval_ptr = eval + mix_addr_nstates[c];
+		double len = site_rate->getRate(mycat)*dad_branch->length;
+		double prop = site_rate->getProp(mycat) * model->getMixtureWeight(m);
+        double *this_val = val + c*nstates;
 		for (i = 0; i < nstates; i++)
-			val[c*nstates+i] = exp(eval[i]*len) * prop;
+			this_val[i] = exp(eval_ptr[i]*len) * prop;
 	}
 
 	double prob_const = 0.0;
-	memset(_pattern_lh_cat, 0, nptn*ncat*sizeof(double));
+	memset(_pattern_lh_cat, 0, sizeof(double)*nptn*ncat_mix);
 
     if (dad->isLeaf()) {
     	// special treatment for TIP-INTERNAL NODE case
@@ -1205,9 +1101,10 @@ double PhyloTree::computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloN
     	// precompute information from one tip
     	for (IntVector::iterator it = states_dad.begin(); it != states_dad.end(); it++) {
     		double *lh_node = partial_lh_node +(*it)*block;
-    		double *lh_tip = tip_partial_lh + (*it)*nstates;
     		double *val_tmp = val;
-			for (c = 0; c < ncat; c++) {
+            double *this_tip_partial_lh = tip_partial_lh + (*it)*tip_block;
+			for (c = 0; c < ncat_mix; c++) {
+                double *lh_tip = this_tip_partial_lh + mix_addr_nstates[c];
 				for (i = 0; i < nstates; i++) {
 					  lh_node[i] = val_tmp[i] * lh_tip[i];
 				}
@@ -1222,27 +1119,37 @@ double PhyloTree::computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloN
 #endif
     	for (ptn = 0; ptn < nptn; ptn++) {
 			double lh_ptn = ptn_invar[ptn];
-            double *lh_cat = _pattern_lh_cat + ptn*ncat;
+            double *lh_cat = _pattern_lh_cat + ptn*ncat_mix;
             double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
-            int state_dad = (ptn < orig_nptn) ? (aln->at(ptn))[dad->id] : model_factory->unobserved_ptns[ptn-orig_nptn];
-            double *lh_node = partial_lh_node + state_dad*block;
-            for (c = 0; c < ncat; c++) {
-                for (i = 0; i < nstates; i++) {
-                    *lh_cat += lh_node[i] * partial_lh_dad[i];
+            UBYTE *scale_dad = dad_branch->scale_num + ptn*ncat_mix;
+            double *lh_node = partial_lh_node + block*((ptn < orig_nptn) ? (aln->at(ptn))[dad->id] : model_factory->unobserved_ptns[ptn-orig_nptn]);
+            // determine the min scaling
+            UBYTE min_scale = scale_dad[0];
+            for (c = 1; c < ncat_mix; c++) 
+                min_scale = min(min_scale, scale_dad[c]);
+
+            for (c = 0; c < ncat_mix; c++) {
+                if (scale_dad[c] <= min_scale+1) {
+                    // only compute for least scale category
+                    for (i = 0; i < nstates; i++) {
+                        *lh_cat += (lh_node[i] * partial_lh_dad[i]);
+                    }
+                    if (scale_dad[c] != min_scale)
+                        *lh_cat *= SCALING_THRESHOLD;
+                    lh_ptn += *lh_cat;
                 }
                 lh_node += nstates;
                 partial_lh_dad += nstates;
-                lh_ptn += *lh_cat;
                 lh_cat++;
             }
 //			assert(lh_ptn > -1e-10);
 			if (ptn < orig_nptn) {
-				lh_ptn = log(fabs(lh_ptn));
+				lh_ptn = log(fabs(lh_ptn)) + LOG_SCALING_THRESHOLD*min_scale;
 				_pattern_lh[ptn] = lh_ptn;
 				tree_lh += lh_ptn * ptn_freq[ptn];
 			} else {
                 // bugfix 2016-01-21, prob_const can be rescaled
-                if (dad_branch->scale_num[ptn] >= 1)
+                if (min_scale >= 1)
                     lh_ptn *= SCALING_THRESHOLD;
 //				_pattern_lh[ptn] = lh_ptn;
 				prob_const += lh_ptn;
@@ -1256,15 +1163,28 @@ double PhyloTree::computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloN
 #endif
     	for (ptn = 0; ptn < nptn; ptn++) {
 			double lh_ptn = ptn_invar[ptn];
-            double *lh_cat = _pattern_lh_cat + ptn*ncat;
+            double *lh_cat = _pattern_lh_cat + ptn*ncat_mix;
             double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
             double *partial_lh_node = node_branch->partial_lh + ptn*block;
             double *val_tmp = val;
-            for (c = 0; c < ncat; c++) {
-                for (i = 0; i < nstates; i++) {
-                    *lh_cat +=  val_tmp[i] * partial_lh_node[i] * partial_lh_dad[i];
+            UBYTE *scale_dad = dad_branch->scale_num + ptn*ncat_mix;
+            UBYTE *scale_node = node_branch->scale_num + ptn*ncat_mix;
+            UBYTE sum_scale[ncat_mix];
+            UBYTE min_scale = sum_scale[0] = scale_dad[0]+scale_node[0];
+            for (c = 1; c < ncat_mix; c++) {
+                sum_scale[c] = scale_dad[c] + scale_node[c];
+                min_scale = min(min_scale, sum_scale[c]);
+            }
+            for (c = 0; c < ncat_mix; c++) {
+                if (sum_scale[c] <= min_scale+1) {
+                    // only compute for least scale category
+                    for (i = 0; i < nstates; i++) {
+                        *lh_cat +=  (val_tmp[i] * partial_lh_node[i] * partial_lh_dad[i]);
+                    }
+                    if (sum_scale[c] != min_scale)
+                        *lh_cat *= SCALING_THRESHOLD;
+                    lh_ptn += *lh_cat;
                 }
-                lh_ptn += *lh_cat;
                 partial_lh_node += nstates;
                 partial_lh_dad += nstates;
                 val_tmp += nstates;
@@ -1273,12 +1193,12 @@ double PhyloTree::computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloN
 
 //			assert(lh_ptn > 0.0);
             if (ptn < orig_nptn) {
-				lh_ptn = log(fabs(lh_ptn));
+				lh_ptn = log(fabs(lh_ptn)) + LOG_SCALING_THRESHOLD*min_scale;
 				_pattern_lh[ptn] = lh_ptn;
 				tree_lh += lh_ptn * ptn_freq[ptn];
 			} else {
                 // bugfix 2016-01-21, prob_const can be rescaled
-                if (dad_branch->scale_num[ptn] + node_branch->scale_num[ptn] >= 1)
+                if (min_scale >= 1)
                     lh_ptn *= SCALING_THRESHOLD;
 //				_pattern_lh[ptn] = lh_ptn;
 				prob_const += lh_ptn;
@@ -1286,30 +1206,7 @@ double PhyloTree::computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloN
 		}
     }
 
-    if (isnan(tree_lh) || isinf(tree_lh)) {
-        cout << "WARNING: Numerical underflow caused by alignment sites";
-        i = aln->getNSite();
-        int j;
-        for (j = 0, c = 0; j < i; j++) {
-            ptn = aln->getPatternID(j);
-            if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
-                cout << " " << j+1;
-                c++;
-                if (c >= 10) {
-                    cout << " ...";
-                    break;
-                }
-            }
-        }
-        cout << endl;
-        tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
-        for (ptn = 0; ptn < orig_nptn; ptn++) {
-            if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
-                _pattern_lh[ptn] = LOG_SCALING_THRESHOLD*4; // log(2^(-1024))
-            }
-            tree_lh += _pattern_lh[ptn] * ptn_freq[ptn];
-        }
-    }
+    assert(!isnan(tree_lh) && !isinf(tree_lh) && "Numerical underflow for lh-branch");
 
     if (orig_nptn < nptn) {
     	// ascertainment bias correction
@@ -1337,1367 +1234,377 @@ double PhyloTree::computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloN
     delete [] val;
     return tree_lh;
 }
+*/
 
 
-
-/************************************************************************************************
+/*******************************************************
  *
- *   non-vectorized fused mixture and rate likelihood functions
+ * ancestral sequence reconstruction
  *
- *************************************************************************************************/
-
-//template <const int nstates>
-void PhyloTree::computeMixratePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
-    // don't recompute the likelihood
-	assert(dad);
-    if (dad_branch->partial_lh_computed & 1)
-        return;
-    dad_branch->partial_lh_computed |= 1;
+ ******************************************************/
 
-    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
-    PhyloNode *node = (PhyloNode*)(dad_branch->node);
 
-    if (!tip_partial_lh_computed)
-        computeTipPartialLikelihood();
+void PhyloTree::computeMarginalAncestralProbability(PhyloNeighbor *dad_branch, PhyloNode *dad, double *ptn_ancestral_prob) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    assert(!node->isLeaf());
 
-	if (node->isLeaf()) {
-	    dad_branch->lh_scale_factor = 0.0;
-		return;
-	}
+    // TODO: not working yet
 
+//    if ((dad_branch->partial_lh_computed & 1) == 0)
+//        computePartialLikelihood(dad_branch, dad);
+//    if ((node_branch->partial_lh_computed & 1) == 0)
+//        computePartialLikelihood(node_branch, node);
     size_t nstates = aln->num_states;
-    size_t ptn, c;
-    size_t orig_ntn = aln->size();
-    size_t ncat = site_rate->getNRate();
-    assert(ncat == model->getNMixtures());
     const size_t nstatesqr=nstates*nstates;
-    size_t i, x;
-    size_t block = nstates * ncat;
+    size_t ncat = site_rate->getNRate();
+    size_t statecat = nstates * ncat;
+    size_t nmixture = model->getNMixtures();
 
-	double *evec = model->getEigenvectors();
-	double *inv_evec = model->getInverseEigenvectors();
-	assert(inv_evec && evec);
-	double *eval = model->getEigenvalues();
+    size_t block = ncat * nstates * nmixture;
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i, m, x;
+    size_t nptn = aln->size();
+    double *eval = model->getEigenvalues();
+    double *evec = model->getEigenvectors();
+    double *inv_evec = model->getInverseEigenvectors();
+    assert(eval);
 
-    dad_branch->lh_scale_factor = 0.0;
+    double echild[block*nstates];
 
-	// internal node
-//	assert(node->degree() == 3); // it works only for strictly bifurcating tree
-	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
-	FOR_NEIGHBOR_IT(node, dad, it) {
-        PhyloNeighbor *nei = (PhyloNeighbor*)*it;
-		if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
-        if ((nei->partial_lh_computed & 1) == 0)
-            computePartialLikelihood(nei, node);
-        dad_branch->lh_scale_factor += nei->lh_scale_factor;
-	}
-        
-    if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
-        // re-orient partial_lh
-        bool done = false;
-        FOR_NEIGHBOR_IT(node, dad, it2) {
-            PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));
-            if (backnei->partial_lh) {
-                dad_branch->partial_lh = backnei->partial_lh;
-                dad_branch->scale_num = backnei->scale_num;
-                backnei->partial_lh = NULL;
-                backnei->scale_num = NULL;
-                backnei->partial_lh_computed &= ~1; // clear bit
-                done = true;
-                break;
+    for (c = 0; c < ncat; c++) {
+        double expchild[nstates];
+        double len_child = site_rate->getRate(c) * dad_branch->length;
+        for (m = 0; m < nmixture; m++) {
+            for (i = 0; i < nstates; i++) {
+                expchild[i] = exp(eval[m*nstates+i]*len_child);
             }
+            for (x = 0; x < nstates; x++)
+                for (i = 0; i < nstates; i++) {
+                    echild[(m*ncat+c)*nstatesqr+x*nstates+i] = evec[m*nstatesqr+x*nstates+i] * expchild[i];
+                }
         }
-        assert(done && "partial_lh is not re-oriented");
-    }        
-        
-    // precompute buffer to save times
-    double *echildren = new double[block*nstates*(node->degree()-1)];
-    double *partial_lh_leaves = new double[(aln->STATE_UNKNOWN+1)*block*(node->degree()-1)];
-    double *echild = echildren;
-    double *partial_lh_leaf = partial_lh_leaves;
-
-    FOR_NEIGHBOR_IT(node, dad, it) {
-        double expchild[nstates];
-        PhyloNeighbor *child = (PhyloNeighbor*)*it;
-        // precompute information buffer
-        for (c = 0; c < ncat; c++) {
-            double len_child = site_rate->getRate(c) * child->length;
-            for (i = 0; i < nstates; i++) {
-                expchild[i] = exp(eval[c*nstates+i]*len_child);
-            }
-            for (x = 0; x < nstates; x++)
-                for (i = 0; i < nstates; i++) {
-                    echild[c*nstatesqr+x*nstates+i] = evec[c*nstatesqr+x*nstates+i] * expchild[i];
-                }
-        }
-        // pre compute information for tip
-        if (child->node->isLeaf()) {
-            vector<int>::iterator it;
-            for (it = aln->seq_states[child->node->id].begin(); it != aln->seq_states[child->node->id].end(); it++) {
-                int state = (*it);
-                for (c = 0; c < ncat; c++)
-                for (x = 0; x < nstates; x++) {
-                    double vchild = 0.0;
-                    for (i = 0; i < nstates; i++) {
-                        vchild += echild[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
-                    }
-                    partial_lh_leaf[state*block+c*nstates+x] = vchild;
-                }
-            }
-            size_t addr = aln->STATE_UNKNOWN * block;
-            for (x = 0; x < block; x++) {
-                partial_lh_leaf[addr+x] = 1.0;
-            }
-            partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
-        }
-        echild += block*nstates;
-    }
-
-    double *eleft = echildren, *eright = echildren + block*nstates;
-    
-	if (!left->node->isLeaf() && right->node->isLeaf()) {
-		PhyloNeighbor *tmp = left;
-		left = right;
-		right = tmp;
-        double *etmp = eleft;
-        eleft = eright;
-        eright = etmp;
-	}
-    
-    if (node->degree() > 3) {
-        /*--------------------- multifurcating node ------------------*/
-        double sum_scale = 0.0;
-    
-        // now for-loop computing partial_lh over all site-patterns
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
-#endif
-        for (ptn = 0; ptn < nptn; ptn++) {
-            double partial_lh_all[block];
-            for (i = 0; i < block; i++)
-                partial_lh_all[i] = 1.0;
-            dad_branch->scale_num[ptn] = 0;
-                
-            double *partial_lh_leaf = partial_lh_leaves;
-            double *echild = echildren;
-
-            FOR_NEIGHBOR_IT(node, dad, it) {
-                PhyloNeighbor *child = (PhyloNeighbor*)*it;
-                if (child->node->isLeaf()) {
-                    // external node
-                    int state_child = (ptn < orig_ntn) ? (aln->at(ptn))[child->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
-                    double *child_lh = partial_lh_leaf + state_child*block;
-                    for (c = 0; c < block; c++) {
-                        partial_lh_all[c] *= child_lh[c];
-                    }
-                    partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
-                } else {
-                    // internal node
-                    double *partial_lh = partial_lh_all;
-                    double *partial_lh_child = child->partial_lh + ptn*block;
-                    dad_branch->scale_num[ptn] += child->scale_num[ptn];
-
-                    double *echild_ptr = echild;
-                    for (c = 0; c < ncat; c++) {
-                        // compute real partial likelihood vector
-                        for (x = 0; x < nstates; x++) {
-                            double vchild = 0.0;
-                            for (i = 0; i < nstates; i++) {
-                                vchild += echild_ptr[i] * partial_lh_child[i];
-                            }
-                            echild_ptr += nstates;
-                            partial_lh[x] *= vchild;
-                        }
-                        partial_lh += nstates;
-                        partial_lh_child += nstates;
-                    }
-                } // if
-                echild += block*nstates;
-            } // FOR_NEIGHBOR
-
-
-            // compute dot-product with inv_eigenvector
-            double lh_max = 0.0;
-            double *partial_lh_tmp = partial_lh_all;
-            double *partial_lh = dad_branch->partial_lh + ptn*block;
-            double *inv_evec_ptr = inv_evec;
-			for (c = 0; c < ncat; c++) {
-				// compute dot-product with inv_eigenvector
-				for (i = 0; i < nstates; i++) {
-					double res = 0.0;
-					for (x = 0; x < nstates; x++) {
-						res += partial_lh_tmp[x]*inv_evec_ptr[x];
-					}
-                    inv_evec_ptr += nstates;
-					partial_lh[i] = res;
-                    lh_max = max(fabs(res), lh_max);
-				}
-                partial_lh += nstates;
-                partial_lh_tmp += nstates;
-			}
-
-            if (lh_max < SCALING_THRESHOLD) {
-				// now do the likelihood scaling
-                partial_lh = dad_branch->partial_lh + ptn*block;
-				for (i = 0; i < block; i++) {
-					partial_lh[i] *= SCALING_THRESHOLD_INVER;
-				}
-				// unobserved const pattern will never have underflow
-				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
-				dad_branch->scale_num[ptn] += 1;
-            }
-            
-        } // for ptn
-        dad_branch->lh_scale_factor += sum_scale;
-                
-        // end multifurcating treatment
-
-	} else if (left->node->isLeaf() && right->node->isLeaf()) {
-		// special treatment for TIP-TIP (cherry) case
-
-		// pre compute information for both tips
-		double *partial_lh_left = partial_lh_leaves;
-		double *partial_lh_right = partial_lh_leaves + (aln->STATE_UNKNOWN+1)*block;
-
-//		vector<int>::iterator it;
-//		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
-//			int state = (*it);
-//			for (c = 0; c < ncat; c++)
-//			for (x = 0; x < nstates; x++) {
-//				double vleft = 0.0;
-//				for (i = 0; i < nstates; i++) {
-//					vleft += eleft[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
-//				}
-//				partial_lh_left[state*block+c*nstates+x] = vleft;
-//			}
-//		}
-//
-//		for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
-//			int state = (*it);
-//			for (c = 0; c < ncat; c++)
-//			for (x = 0; x < nstates; x++) {
-//				double vright = 0.0;
-//				for (i = 0; i < nstates; i++) {
-//					vright += eright[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
-//				}
-//				partial_lh_right[state*block+c*nstates+x] = vright;
-//			}
-//		}
-//
-//		for (x = 0; x < block; x++) {
-//			size_t addr = aln->STATE_UNKNOWN * block;
-//			partial_lh_left[addr+x] = 1.0;
-//			partial_lh_right[addr+x] = 1.0;
-//		}
-//
-
-		// scale number must be ZERO
-	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
-#ifdef _OPENMP
-//#pragma omp parallel for private(ptn, c, x, i, partial_lh_tmp)
-#pragma omp parallel for private(ptn, c, x, i)
-#endif
-		for (ptn = 0; ptn < nptn; ptn++) {
-			double partial_lh_tmp[nstates];
-			double *partial_lh = dad_branch->partial_lh + ptn*block;
-			int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
-			int state_right = (ptn < orig_ntn) ? (aln->at(ptn))[right->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
-			for (c = 0; c < ncat; c++) {
-				// compute real partial likelihood vector
-				double *left = partial_lh_left + (state_left*block+c*nstates);
-				double *right = partial_lh_right + (state_right*block+c*nstates);
-				for (x = 0; x < nstates; x++) {
-					partial_lh_tmp[x] = left[x] * right[x];
-				}
-
-				// compute dot-product with inv_eigenvector
-				for (i = 0; i < nstates; i++) {
-					double res = 0.0;
-					for (x = 0; x < nstates; x++) {
-						res += partial_lh_tmp[x]*inv_evec[c*nstatesqr+i*nstates+x];
-					}
-					partial_lh[c*nstates+i] = res;
-				}
-			}
-		}
-//		delete [] partial_lh_right;
-//		delete [] partial_lh_left;
-	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
-		// special treatment to TIP-INTERNAL NODE case
-		// only take scale_num from the right subtree
-		memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
-
-		// pre compute information for left tip
-		double *partial_lh_left = partial_lh_leaves;
-
-//		vector<int>::iterator it;
-//		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
-//			int state = (*it);
-//			for (c = 0; c < ncat; c++)
-//			for (x = 0; x < nstates; x++) {
-//				double vleft = 0.0;
-//				for (i = 0; i < nstates; i++) {
-//					vleft += eleft[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
-//				}
-//				partial_lh_left[state*block+c*nstates+x] = vleft;
-//			}
-//		}
-//		for (x = 0; x < block; x++) {
-//			size_t addr = aln->STATE_UNKNOWN * block;
-//			partial_lh_left[addr+x] = 1.0;
-//		}
-//
-
-		double sum_scale = 0.0;
-#ifdef _OPENMP
-//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, partial_lh_tmp)
-#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i)
-#endif
-		for (ptn = 0; ptn < nptn; ptn++) {
-			double partial_lh_tmp[nstates];
-			double *partial_lh = dad_branch->partial_lh + ptn*block;
-			double *partial_lh_right = right->partial_lh + ptn*block;
-			int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
-            double lh_max = 0.0;
-
-			for (c = 0; c < ncat; c++) {
-				// compute real partial likelihood vector
-				for (x = 0; x < nstates; x++) {
-					double vleft = 0.0, vright = 0.0;
-					size_t addr = c*nstatesqr+x*nstates;
-					vleft = partial_lh_left[state_left*block+c*nstates+x];
-					for (i = 0; i < nstates; i++) {
-						vright += eright[addr+i] * partial_lh_right[c*nstates+i];
-					}
-					partial_lh_tmp[x] = vleft * (vright);
-				}
-				// compute dot-product with inv_eigenvector
-				for (i = 0; i < nstates; i++) {
-					double res = 0.0;
-					for (x = 0; x < nstates; x++) {
-						res += partial_lh_tmp[x]*inv_evec[c*nstatesqr+i*nstates+x];
-					}
-					partial_lh[c*nstates+i] = res;
-                    lh_max = max(fabs(res), lh_max);
-				}
-			}
-            if (lh_max < SCALING_THRESHOLD) {
-				// now do the likelihood scaling
-				for (i = 0; i < block; i++) {
-					partial_lh[i] *= SCALING_THRESHOLD_INVER;
-				}
-				// unobserved const pattern will never have underflow
-				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
-				dad_branch->scale_num[ptn] += 1;
-            }
-
-
-		}
-		dad_branch->lh_scale_factor += sum_scale;
-//		delete [] partial_lh_left;
-
-	} else {
-		// both left and right are internal node
-
-		double sum_scale = 0.0;
-#ifdef _OPENMP
-//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, partial_lh_tmp)
-#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i)
-#endif
-		for (ptn = 0; ptn < nptn; ptn++) {
-			double partial_lh_tmp[nstates];
-			double *partial_lh = dad_branch->partial_lh + ptn*block;
-			double *partial_lh_left = left->partial_lh + ptn*block;
-			double *partial_lh_right = right->partial_lh + ptn*block;
-            double lh_max = 0.0;
-			dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
-
-			for (c = 0; c < ncat; c++) {
-				// compute real partial likelihood vector
-				for (x = 0; x < nstates; x++) {
-					double vleft = 0.0, vright = 0.0;
-					size_t addr = c*nstatesqr+x*nstates;
-					for (i = 0; i < nstates; i++) {
-						vleft += eleft[addr+i] * partial_lh_left[c*nstates+i];
-						vright += eright[addr+i] * partial_lh_right[c*nstates+i];
-					}
-					partial_lh_tmp[x] = vleft*vright;
-				}
-				// compute dot-product with inv_eigenvector
-				for (i = 0; i < nstates; i++) {
-					double res = 0.0;
-					for (x = 0; x < nstates; x++) {
-						res += partial_lh_tmp[x]*inv_evec[c*nstatesqr+i*nstates+x];
-					}
-					partial_lh[c*nstates+i] = res;
-                    lh_max = max(lh_max, fabs(res));
-				}
-			}
-            if (lh_max < SCALING_THRESHOLD) {
-				// now do the likelihood scaling
-				for (i = 0; i < block; i++) {
-                    partial_lh[i] *= SCALING_THRESHOLD_INVER;
-				}
-				// unobserved const pattern will never have underflow
-                sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
-				dad_branch->scale_num[ptn] += 1;
-            }
-
-		}
-		dad_branch->lh_scale_factor += sum_scale;
-
-	}
-
-    delete [] partial_lh_leaves;
-    delete [] echildren;
-}
-
-/*
- * Accumulates derivative sums of the log-likelihood with respect to
- * dad_branch->length for the rate-mixture eigen kernel and stores them in
- * the reference arguments df and ddf.
- *
- * dad_branch / dad identify the branch; if dad_branch->node is a leaf the two
- * ends are swapped first, so that a leaf end (if any) is 'dad'.
- *
- * theta_all is (re)filled only while theta_computed is false and the flag is
- * then set, so later calls reuse it until the flag is cleared elsewhere.
- * When df evaluates to NaN or infinity both outputs are reset to 0.0 before
- * the ascertainment-bias term is added.
- *
- * Originally marked with a disabled "template <const int nstates>" line.
- */
-void PhyloTree::computeMixrateLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
-    PhyloNode *node = (PhyloNode*) dad_branch->node;
-    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
-    if (!central_partial_lh)
-        initializeAllPartialLh();
-    if (node->isLeaf()) { // swap both node and neighbor roles so the leaf end becomes 'dad'
-    	PhyloNode *tmp_node = dad;
-    	dad = node;
-    	node = tmp_node;
-    	PhyloNeighbor *tmp_nei = dad_branch;
-    	dad_branch = node_branch;
-    	node_branch = tmp_nei;
-    }
-    if ((dad_branch->partial_lh_computed & 1) == 0) // bit 0 clear: partial likelihood not yet available
-        computeMixratePartialLikelihoodEigen(dad_branch, dad);
-    if ((node_branch->partial_lh_computed & 1) == 0)
-        computeMixratePartialLikelihoodEigen(node_branch, node);
-    size_t nstates = aln->num_states;
-    size_t ncat = site_rate->getNRate();
-
-    size_t block = ncat * nstates; // number of partial-likelihood entries per pattern
-    size_t ptn; // for big data size > 4GB memory required
-    size_t c, i;
-    size_t orig_nptn = aln->size();
-    size_t nptn = aln->size()+model_factory->unobserved_ptns.size(); // alignment patterns followed by unobserved patterns
-    double *eval = model->getEigenvalues();
-    assert(eval);
-
-	assert(theta_all);
-	if (!theta_computed) {
-		// precompute theta for fast branch length optimization
-
-	    if (dad->isLeaf()) {
-	    	// special treatment for TIP-INTERNAL NODE case
-#ifdef _OPENMP
-#pragma omp parallel for private(ptn, i)
-#endif
-	    	for (ptn = 0; ptn < nptn; ptn++) {
-				double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
-				double *theta = theta_all + ptn*block;
-				double *lh_tip = tip_partial_lh + ((int)((ptn < orig_nptn) ? (aln->at(ptn))[dad->id] :  model_factory->unobserved_ptns[ptn-orig_nptn]))*nstates*ncat; // tip table row selected by the state at the leaf
-				for (i = 0; i < block; i++) {
-					theta[i] = lh_tip[i] * partial_lh_dad[i];
-				}
-
-			}
-			// ascertainment bias correction
-	    } else {
-	    	// both dad and node are internal nodes
-		    double *partial_lh_node = node_branch->partial_lh;
-		    double *partial_lh_dad = dad_branch->partial_lh;
-
-	    	size_t all_entries = nptn*block;
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-	    	for (i = 0; i < all_entries; i++) {
-				theta_all[i] = partial_lh_node[i] * partial_lh_dad[i]; // element-wise product of the two directions
-			}
-	    }
-		theta_computed = true;
-	}
-
-    double *val0 = new double[block]; // exp(cof*length)*prop
-    double *val1 = new double[block]; // cof   * val0
-    double *val2 = new double[block]; // cof^2 * val0
-	for (c = 0; c < ncat; c++) {
-		double prop = site_rate->getProp(c);
-		for (i = 0; i < nstates; i++) {
-			double cof = eval[c*nstates+i]*site_rate->getRate(c); // eigenvalue entry times the category rate
-			double val = exp(cof*dad_branch->length) * prop;
-			double val1_ = cof*val;
-			val0[c*nstates+i] = val;
-			val1[c*nstates+i] = val1_;
-			val2[c*nstates+i] = cof*val1_;
-		}
-	}
-
-
-    double my_df = 0.0, my_ddf = 0.0, prob_const = 0.0, df_const = 0.0, ddf_const = 0.0;
-
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: my_df, my_ddf, prob_const, df_const, ddf_const) private(ptn, i)
-#endif
-    for (ptn = 0; ptn < nptn; ptn++) {
-		double lh_ptn = ptn_invar[ptn], df_ptn = 0.0, ddf_ptn = 0.0; // likelihood starts from the ptn_invar entry
-		double *theta = theta_all + ptn*block;
-		for (i = 0; i < block; i++) {
-			lh_ptn += val0[i] * theta[i];
-			df_ptn += val1[i] * theta[i];
-			ddf_ptn += val2[i] * theta[i];
-		}
-
-//        assert(lh_ptn > 0.0);
-        lh_ptn = fabs(lh_ptn); // the sign is dropped instead of asserting positivity
-
-        if (ptn < orig_nptn) {
-			double df_frac = df_ptn / lh_ptn;
-			double ddf_frac = ddf_ptn / lh_ptn;
-			double freq = ptn_freq[ptn];
-			double tmp1 = df_frac * freq;
-			double tmp2 = ddf_frac * freq;
-			my_df += tmp1;
-			my_ddf += tmp2 - tmp1 * df_frac; // freq * (ddf/lh - (df/lh)^2)
-		} else {
-			// ascertainment bias correction
-			// NOTE(review): unlike computeMixrateLikelihoodBranchEigen, no scale_num-based
-			// rescaling is applied to these sums here - confirm this is intended
-			prob_const += lh_ptn;
-			df_const += df_ptn;
-			ddf_const += ddf_ptn;
-		}
-    }
-	df = my_df;
-	ddf = my_ddf;
-    if (isnan(df) || isinf(df)) {
-        df = 0.0;
-        ddf = 0.0;
-//        outWarning("Numerical instability (some site-likelihood = 0)");
-    }
-
-	if (orig_nptn < nptn) { // only when unobserved patterns exist
-    	// ascertainment bias correction
-    	prob_const = 1.0 - prob_const;
-    	double df_frac = df_const / prob_const;
-    	double ddf_frac = ddf_const / prob_const;
-    	int nsites = aln->getNSite();
-    	df += nsites * df_frac;
-    	ddf += nsites *(ddf_frac + df_frac*df_frac);
-    }
-
-
-    delete [] val2;
-    delete [] val1;
-    delete [] val0;
-}
-
-/*
- * Evaluates the log-likelihood across the branch joining dad and
- * dad_branch->node for the rate-mixture eigen kernel.
- *
- * If dad_branch->node is a leaf the two ends are swapped first, so that a
- * leaf end (if any) is 'dad'. Missing partial likelihoods are obtained via
- * computePartialLikelihood().
- *
- * Side effects visible here: _pattern_lh_cat is zeroed and refilled with ncat
- * values per pattern, and _pattern_lh receives the log-likelihood of each of
- * the first aln->size() patterns.
- *
- * Returns the sum of both neighbors' lh_scale_factor and the
- * ptn_freq-weighted pattern log-likelihoods. When
- * model_factory->unobserved_ptns is non-empty, log(1 - prob_const) is
- * subtracted from every stored pattern value and aln->getNSite() times from
- * the returned total.
- *
- * Originally marked with a disabled "template <const int nstates>" line.
- */
-double PhyloTree::computeMixrateLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
-    PhyloNode *node = (PhyloNode*) dad_branch->node;
-    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
-    if (!central_partial_lh)
-        initializeAllPartialLh();
-    if (node->isLeaf()) { // swap both node and neighbor roles so the leaf end becomes 'dad'
-    	PhyloNode *tmp_node = dad;
-    	dad = node;
-    	node = tmp_node;
-    	PhyloNeighbor *tmp_nei = dad_branch;
-    	dad_branch = node_branch;
-    	node_branch = tmp_nei;
-    }
-    if ((dad_branch->partial_lh_computed & 1) == 0)
-//        computeMixratePartialLikelihoodEigen(dad_branch, dad);
-        computePartialLikelihood(dad_branch, dad);
-    if ((node_branch->partial_lh_computed & 1) == 0)
-//        computeMixratePartialLikelihoodEigen(node_branch, node);
-        computePartialLikelihood(node_branch, node);
-    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor; // start from the accumulated scaling terms
-    size_t nstates = aln->num_states;
-    size_t ncat = site_rate->getNRate();
-
-    size_t block = ncat * nstates; // number of partial-likelihood entries per pattern
-    size_t ptn; // for big data size > 4GB memory required
-    size_t c, i;
-    size_t orig_nptn = aln->size();
-    size_t nptn = aln->size()+model_factory->unobserved_ptns.size(); // alignment patterns followed by unobserved patterns
-    double *eval = model->getEigenvalues();
-    assert(eval);
-
-    double *val = new double[block]; // exp(eval*rate*length)*prop for every (category, state)
-	for (c = 0; c < ncat; c++) {
-		double len = site_rate->getRate(c)*dad_branch->length;
-		double prop = site_rate->getProp(c);
-		for (i = 0; i < nstates; i++)
-			val[c*nstates+i] = exp(eval[c*nstates+i]*len) * prop;
-	}
-
-	double prob_const = 0.0;
-	memset(_pattern_lh_cat, 0, nptn*ncat*sizeof(double)); // the loops below accumulate with +=
-
-    if (dad->isLeaf()) {
-    	// special treatment for TIP-INTERNAL NODE case
-    	double *partial_lh_node = new double[(aln->STATE_UNKNOWN+1)*block]; // one block per state index up to STATE_UNKNOWN
-    	IntVector states_dad = aln->seq_states[dad->id];
-    	states_dad.push_back(aln->STATE_UNKNOWN); // STATE_UNKNOWN is always precomputed as well
-    	// precompute information from one tip
-    	for (IntVector::iterator it = states_dad.begin(); it != states_dad.end(); it++) {
-    		double *lh_node = partial_lh_node +(*it)*block;
-    		double *lh_tip = tip_partial_lh + (*it)*block;
-    		for (i = 0; i < block; i++)
-    			lh_node[i] = val[i]*lh_tip[i];
-    	}
-
-    	// now do the real computation
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, i, c)
-#endif
-    	for (ptn = 0; ptn < nptn; ptn++) {
-			double lh_ptn = ptn_invar[ptn];
-			double *lh_cat = _pattern_lh_cat + ptn*ncat;
-			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
-			int state_dad = (ptn < orig_nptn) ? (aln->at(ptn))[dad->id] : model_factory->unobserved_ptns[ptn-orig_nptn];
-			double *lh_node = partial_lh_node + state_dad*block;
-			for (c = 0; c < ncat; c++) {
-				for (i = 0; i < nstates; i++) {
-					*lh_cat += lh_node[i] * partial_lh_dad[i];
-				}
-				lh_node += nstates;
-				partial_lh_dad += nstates;
-				lh_ptn += *lh_cat; // add this category's finished sum to the pattern likelihood
-				lh_cat++;
-			}
-//			assert(lh_ptn > 0.0);
-			if (ptn < orig_nptn) {
-				// NOTE(review): this path takes fabs() while the internal-node path below
-				// asserts lh_ptn > 0.0 - confirm the asymmetry is intended
-				lh_ptn = log(fabs(lh_ptn));
-				_pattern_lh[ptn] = lh_ptn;
-				tree_lh += lh_ptn * ptn_freq[ptn];
-			} else {
-                // bugfix 2016-01-21, prob_const can be rescaled
-                // NOTE(review): the factor is applied once even if scale_num exceeds 1 - confirm
-                if (dad_branch->scale_num[ptn] >= 1)
-                    lh_ptn *= SCALING_THRESHOLD;
-				prob_const += lh_ptn;
-			}
-		}
-		delete [] partial_lh_node;
-    } else {
-    	// both dad and node are internal nodes
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, i, c)
-#endif
-    	for (ptn = 0; ptn < nptn; ptn++) {
-			double lh_ptn = ptn_invar[ptn];
-			double *lh_cat = _pattern_lh_cat + ptn*ncat;
-			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
-			double *partial_lh_node = node_branch->partial_lh + ptn*block;
-			double *val_tmp = val;
-			for (c = 0; c < ncat; c++) {
-				for (i = 0; i < nstates; i++) {
-					*lh_cat +=  val_tmp[i] * partial_lh_node[i] * partial_lh_dad[i];
-				}
-				lh_ptn += *lh_cat; // add this category's finished sum to the pattern likelihood
-				partial_lh_node += nstates;
-				partial_lh_dad += nstates;
-				val_tmp += nstates;
-				lh_cat++;
-			}
-
-			assert(lh_ptn > 0.0);
-            if (ptn < orig_nptn) {
-				lh_ptn = log(lh_ptn);
-				_pattern_lh[ptn] = lh_ptn;
-				tree_lh += lh_ptn * ptn_freq[ptn];
-			} else {
-                // bugfix 2016-01-21, prob_const can be rescaled
-                // NOTE(review): the factor is applied once even if the combined count exceeds 1 - confirm
-                if (dad_branch->scale_num[ptn] + node_branch->scale_num[ptn] >= 1)
-                    lh_ptn *= SCALING_THRESHOLD;
-				prob_const += lh_ptn;
-			}
-		}
-    }
-
-
-    if (orig_nptn < nptn) { // only when unobserved patterns exist
-    	// ascertainment bias correction
-    	prob_const = log(1.0 - prob_const);
-    	for (ptn = 0; ptn < orig_nptn; ptn++)
-    		_pattern_lh[ptn] -= prob_const;
-    	tree_lh -= aln->getNSite()*prob_const;
-		assert(!isnan(tree_lh) && !isinf(tree_lh));
-    }
-
-	assert(!isnan(tree_lh) && !isinf(tree_lh));
-
-    delete [] val;
-    return tree_lh;
-}
-
-
-/*******************************************************
- *
- * non-vectorized likelihood functions for mixture models
- *
- ******************************************************/
-
-/*
- * Fills dad_branch->partial_lh, dad_branch->scale_num and
- * dad_branch->lh_scale_factor for the subtree behind dad_branch (seen from
- * dad) under a mixture model.
- *
- * Returns immediately when bit 0 of partial_lh_computed is already set, and
- * sets that bit otherwise. For a leaf only lh_scale_factor is reset to 0.
- *
- * Per-pattern layout used throughout: block = nstates*ncat*nmixture entries,
- * addressed as m*statecat + c*nstates + state.
- *
- * Four code paths follow the precomputation: node degree > 3, two leaf
- * children, one leaf child (moved to 'left'), and two internal children.
- * Every path except the two-leaf one rescales a pattern whose largest
- * absolute entry is below SCALING_THRESHOLD.
- *
- * NOTE(review): expchild, partial_lh_all and partial_lh_tmp are stack arrays
- * sized by run-time values, which relies on a compiler extension in C++.
- *
- * Originally marked with a disabled "template <const int nstates>" line.
- */
-void PhyloTree::computeMixturePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
-    // don't recompute the likelihood
-	assert(dad);
-    if (dad_branch->partial_lh_computed & 1)
-        return;
-    dad_branch->partial_lh_computed |= 1;
-
-    size_t nstates = aln->num_states;
-    size_t nptn = aln->size()+model_factory->unobserved_ptns.size(); // alignment patterns followed by unobserved patterns
-    PhyloNode *node = (PhyloNode*)(dad_branch->node);
-
-    if (!tip_partial_lh_computed)
-        computeTipPartialLikelihood();
-
-	if (node->isLeaf()) {
-	    dad_branch->lh_scale_factor = 0.0;
-		return;
-	}
-
-    size_t ptn, c;
-    size_t orig_ntn = aln->size();
-    size_t ncat = site_rate->getNRate(), nmixture = model->getNMixtures();
-    const size_t nstatesqr=nstates*nstates;
-    size_t i, x, m;
-    size_t statecat = nstates * ncat;
-//    size_t statemix = nstates * nmixture;
-    size_t block = nstates * ncat * nmixture;
-
-	double *evec = model->getEigenvectors();
-	double *inv_evec = model->getInverseEigenvectors();
-	assert(inv_evec && evec);
-	double *eval = model->getEigenvalues();
-
-    dad_branch->lh_scale_factor = 0.0;
-
-	// internal node
-	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
-	FOR_NEIGHBOR_IT(node, dad, it) {
-        PhyloNeighbor *nei = (PhyloNeighbor*)*it;
-		if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
-        if ((nei->partial_lh_computed & 1) == 0)
-            computePartialLikelihood(nei, node);
-        dad_branch->lh_scale_factor += nei->lh_scale_factor; // children's scaling terms carry over
-	}
-
-    if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
-        // re-orient partial_lh
-        bool done = false;
-        FOR_NEIGHBOR_IT(node, dad, it2) {
-            PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));
-            if (backnei->partial_lh) {
-                // take over the buffers of the first back-neighbor that owns some
-                dad_branch->partial_lh = backnei->partial_lh;
-                dad_branch->scale_num = backnei->scale_num;
-                backnei->partial_lh = NULL;
-                backnei->scale_num = NULL;
-                backnei->partial_lh_computed &= ~1; // clear bit
-                done = true;
-                break;
-            }
-        }
-        assert(done && "partial_lh is not re-oriented");
-    }
-
-        
-    double *echildren = new double[block*nstates*(node->degree()-1)]; // block*nstates entries per child
-    double *partial_lh_leaves = new double[(aln->STATE_UNKNOWN+1)*block*(node->degree()-1)]; // (STATE_UNKNOWN+1) blocks per child; filled for leaf children only
-    double *echild = echildren;
-    double *partial_lh_leaf = partial_lh_leaves;
-
-    FOR_NEIGHBOR_IT(node, dad, it) {
-        // precompute information buffer
-        double expchild[nstates];
-        PhyloNeighbor *child = (PhyloNeighbor*)*it;
-        for (c = 0; c < ncat; c++) {
-            double len_child = site_rate->getRate(c) * child->length;
-            for (m = 0; m < nmixture; m++) {
-                for (i = 0; i < nstates; i++) {
-                    expchild[i] = exp(eval[m*nstates+i]*len_child);
-                }
-                for (x = 0; x < nstates; x++)
-                    for (i = 0; i < nstates; i++) {
-                        echild[(m*ncat+c)*nstatesqr+x*nstates+i] = evec[m*nstatesqr+x*nstates+i] * expchild[i];
-                    }
-            }
-        }
-        if (child->node->isLeaf()) {
-            vector<int>::iterator it; // NOTE(review): presumably shadows the iterator declared by FOR_NEIGHBOR_IT - confirm against the macro
-            for (it = aln->seq_states[child->node->id].begin(); it != aln->seq_states[child->node->id].end(); it++) {
-                int state = (*it);
-                for (m = 0; m < nmixture; m++) {
-                    double *this_echild = &echild[m*nstatesqr*ncat];
-                    double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixture + m*nstates];
-                    double *this_partial_lh_leaf = &partial_lh_leaf[state*block+m*statecat];
-                    for (x = 0; x < statecat; x++) { // x runs over (category, state) pairs of mixture m
-                        double vchild = 0.0;
-                        for (i = 0; i < nstates; i++) {
-                            vchild += this_echild[x*nstates+i] * this_tip_partial_lh[i];
-                        }
-                        this_partial_lh_leaf[x] = vchild;
-                    }
-                }
-            }
-            size_t addr = aln->STATE_UNKNOWN * block;
-            for (x = 0; x < block; x++) {
-                partial_lh_leaf[addr+x] = 1.0; // STATE_UNKNOWN contributes a neutral factor
-            }
-            partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block; // advanced for leaf children only
-        }
-        echild += block*nstates; // advanced for every child
-    }
-
-    double *eleft = echildren, *eright = echildren + block*nstates;
-    
-	if (!left->node->isLeaf() && right->node->isLeaf()) {
-		// keep a single leaf child on the left, together with its echild table
-		PhyloNeighbor *tmp = left;
-		left = right;
-		right = tmp;
-        double *etmp = eleft;
-        eleft = eright;
-        eright = etmp;
-	}
-    
-    if (node->degree() > 3) {
-        /*--------------------- multifurcating node ------------------*/
-        // now for-loop computing partial_lh over all site-patterns
-        double sum_scale = 0.0;
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, m) schedule(static)
-#endif
-        for (ptn = 0; ptn < nptn; ptn++) {
-            double partial_lh_all[block]; // running product over all children for this pattern
-            for (i = 0; i < block; i++)
-                partial_lh_all[i] = 1.0;
-            dad_branch->scale_num[ptn] = 0;
-                
-            double *partial_lh_leaf = partial_lh_leaves;
-            double *echild = echildren;
-
-            FOR_NEIGHBOR_IT(node, dad, it) {
-                PhyloNeighbor *child = (PhyloNeighbor*)*it;
-                if (child->node->isLeaf()) {
-                    // external node
-                    int state_child = (ptn < orig_ntn) ? (aln->at(ptn))[child->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
-                    double *child_lh = partial_lh_leaf + state_child*block;
-                    for (c = 0; c < block; c++) {
-                        // compute real partial likelihood vector
-                        partial_lh_all[c] *= child_lh[c];
-                    }
-                    partial_lh_leaf += (aln->STATE_UNKNOWN+1)*block;
-                } else {
-                    // internal node
-                    double *partial_lh = partial_lh_all;
-                    double *partial_lh_child = child->partial_lh + ptn*block;
-                    dad_branch->scale_num[ptn] += child->scale_num[ptn];
-                    double *echild_ptr = echild;
-
-                    for (m = 0; m < nmixture; m++) {
-                        for (c = 0; c < ncat; c++) {
-                            // compute real partial likelihood vector
-                            for (x = 0; x < nstates; x++) {
-                                double vchild = 0.0;
-//                                size_t addr = (m*ncat+c)*nstatesqr+x*nstates;
-                                for (i = 0; i < nstates; i++) {
-                                    vchild += echild_ptr[i] * partial_lh_child[i];
-                                }
-                                echild_ptr += nstates;
-                                partial_lh[x] *= vchild;
-                            }
-                            partial_lh += nstates;
-                            partial_lh_child += nstates;
-                        }
-                    }
-
-                } // if
-                echild += block*nstates;
-            } // FOR_NEIGHBOR
-
-            // compute dot-product with inv_eigenvector
-            double lh_max = 0.0;
-            double *partial_lh_tmp = partial_lh_all;
-            double *partial_lh = dad_branch->partial_lh+ptn*block;
-            for (m = 0; m < nmixture; m++) {
-				for (c = 0; c < ncat; c++) {
-                    double *inv_evec_ptr = inv_evec + m*nstatesqr; // restarted for every category of mixture m
-					for (i = 0; i < nstates; i++) {
-						double res = 0.0;
-						for (x = 0; x < nstates; x++) {
-							res += partial_lh_tmp[x]*inv_evec_ptr[x];
-						}
-                        inv_evec_ptr += nstates;
-						partial_lh[i] = res;
-						lh_max = max(fabs(res), lh_max);
-					}
-                    partial_lh += nstates;
-                    partial_lh_tmp += nstates;
-				}
-            }
-            if (lh_max < SCALING_THRESHOLD) {
-				// now do the likelihood scaling
-                partial_lh = dad_branch->partial_lh + ptn*block;
-				for (i = 0; i < block; i++) {
-					partial_lh[i] *= SCALING_THRESHOLD_INVER;
-				}
-				// unobserved const pattern will never have underflow
-				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
-				dad_branch->scale_num[ptn] += 1;
-            }
-
-        } // for ptn
-        dad_branch->lh_scale_factor += sum_scale;               
-                
-        // end multifurcating treatment
-        
-	} else if (left->node->isLeaf() && right->node->isLeaf()) {
-		// special treatment for TIP-TIP (cherry) case
-
-		// pre compute information for both tips
-		double *partial_lh_left = partial_lh_leaves;
-		double *partial_lh_right = partial_lh_leaves + (aln->STATE_UNKNOWN+1)*block; // second leaf's table follows the first
-
-		// scale number must be ZERO
-	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
-#ifdef _OPENMP
-#pragma omp parallel for private(ptn, c, x, i, m)
-#endif
-		for (ptn = 0; ptn < nptn; ptn++) {
-			double partial_lh_tmp[nstates];
-			double *partial_lh = dad_branch->partial_lh + ptn*block;
-			int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
-			int state_right = (ptn < orig_ntn) ? (aln->at(ptn))[right->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
-			for (m = 0; m < nmixture; m++) {
-				for (c = 0; c < ncat; c++) {
-					// compute real partial likelihood vector
-					double *left = partial_lh_left + (state_left*block+m*statecat+c*nstates); // local pointers shadow the neighbor variables of the same name
-					double *right = partial_lh_right + (state_right*block+m*statecat+c*nstates);
-					for (x = 0; x < nstates; x++) {
-						partial_lh_tmp[x] = left[x] * right[x];
-					}
-
-					// compute dot-product with inv_eigenvector
-					for (i = 0; i < nstates; i++) {
-						double res = 0.0;
-						for (x = 0; x < nstates; x++) {
-							res += partial_lh_tmp[x]*inv_evec[m*nstatesqr+i*nstates+x];
-						}
-						partial_lh[m*statecat+c*nstates+i] = res;
-					}
-				}
-			}
-		}
-	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
-		// special treatment to TIP-INTERNAL NODE case
-		// only take scale_num from the right subtree
-		memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
-
-		// pre compute information for left tip
-		double *partial_lh_left = partial_lh_leaves;
-
-		double sum_scale = 0.0;
-#ifdef _OPENMP
-//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, m, partial_lh_tmp)
-#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, m)
-#endif
-		for (ptn = 0; ptn < nptn; ptn++) {
-			double partial_lh_tmp[nstates];
-			double *partial_lh = dad_branch->partial_lh + ptn*block;
-			double *partial_lh_right = right->partial_lh + ptn*block;
-			int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
-            double lh_max = 0.0;
-
-            for (m = 0; m < nmixture; m++) {
-				for (c = 0; c < ncat; c++) {
-					// compute real partial likelihood vector
-					for (x = 0; x < nstates; x++) {
-						double vleft = 0.0, vright = 0.0;
-						size_t addr = (m*ncat+c)*nstatesqr+x*nstates;
-						vleft = partial_lh_left[state_left*block+m*statecat+c*nstates+x]; // leaf side is a table lookup
-						for (i = 0; i < nstates; i++) {
-							vright += eright[addr+i] * partial_lh_right[m*statecat+c*nstates+i];
-						}
-						partial_lh_tmp[x] = vleft * (vright);
-					}
-					// compute dot-product with inv_eigenvector
-					for (i = 0; i < nstates; i++) {
-						double res = 0.0;
-						for (x = 0; x < nstates; x++) {
-							res += partial_lh_tmp[x]*inv_evec[m*nstatesqr+i*nstates+x];
-						}
-						partial_lh[m*statecat+c*nstates+i] = res;
-						lh_max = max(fabs(res), lh_max);
-					}
-				}
-            }
-            if (lh_max < SCALING_THRESHOLD) {
-				// now do the likelihood scaling
-				for (i = 0; i < block; i++) {
-					partial_lh[i] *= SCALING_THRESHOLD_INVER;
-				}
-				// unobserved const pattern will never have underflow
-				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
-				dad_branch->scale_num[ptn] += 1;
-            }
-
-
-		}
-		dad_branch->lh_scale_factor += sum_scale;
-
-	} else {
-		// both left and right are internal node
-
-		double sum_scale = 0.0;
-#ifdef _OPENMP
-//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, m, partial_lh_tmp)
-#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, m)
-#endif
-		for (ptn = 0; ptn < nptn; ptn++) {
-			double partial_lh_tmp[nstates];
-			double *partial_lh = dad_branch->partial_lh + ptn*block;
-			double *partial_lh_left = left->partial_lh + ptn*block;
-			double *partial_lh_right = right->partial_lh + ptn*block;
-            double lh_max = 0.0;
-			dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
-
-			for (m = 0; m < nmixture; m++) {
-				for (c = 0; c < ncat; c++) {
-					// compute real partial likelihood vector
-					for (x = 0; x < nstates; x++) {
-						double vleft = 0.0, vright = 0.0;
-						size_t addr = (m*ncat+c)*nstatesqr+x*nstates;
-						for (i = 0; i < nstates; i++) {
-							vleft += eleft[addr+i] * partial_lh_left[m*statecat+c*nstates+i];
-							vright += eright[addr+i] * partial_lh_right[m*statecat+c*nstates+i];
-						}
-						partial_lh_tmp[x] = vleft*vright;
-					}
-					// compute dot-product with inv_eigenvector
-					for (i = 0; i < nstates; i++) {
-						double res = 0.0;
-						for (x = 0; x < nstates; x++) {
-							res += partial_lh_tmp[x]*inv_evec[m*nstatesqr+i*nstates+x];
-						}
-						partial_lh[m*statecat+c*nstates+i] = res;
-						lh_max = max(lh_max, fabs(res));
-					}
-				}
-			}
-            if (lh_max < SCALING_THRESHOLD) {
-				// now do the likelihood scaling
-				for (i = 0; i < block; i++) {
-                    partial_lh[i] *= SCALING_THRESHOLD_INVER;
-				}
-				// unobserved const pattern will never have underflow
-                sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
-				dad_branch->scale_num[ptn] += 1;
-            }
-
-		}
-		dad_branch->lh_scale_factor += sum_scale;
-
-	}
-    
-    delete [] partial_lh_leaves;
-	delete [] echildren;
-}
-
-//template <const int nstates>
-void PhyloTree::computeMixtureLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
-    PhyloNode *node = (PhyloNode*) dad_branch->node;
-    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
-    if (!central_partial_lh)
-        initializeAllPartialLh();
-    if (node->isLeaf()) {
-    	PhyloNode *tmp_node = dad;
-    	dad = node;
-    	node = tmp_node;
-    	PhyloNeighbor *tmp_nei = dad_branch;
-    	dad_branch = node_branch;
-    	node_branch = tmp_nei;
-    }
-    if ((dad_branch->partial_lh_computed & 1) == 0)
-        computeMixturePartialLikelihoodEigen(dad_branch, dad);
-    if ((node_branch->partial_lh_computed & 1) == 0)
-        computeMixturePartialLikelihoodEigen(node_branch, node);
-    size_t nstates = aln->num_states;
-    size_t ncat = site_rate->getNRate();
-    size_t nmixture = model->getNMixtures();
-
-    size_t block = ncat * nstates * nmixture;
-    size_t statemix = nstates * nmixture;
-    size_t statecat = nstates * ncat;
-    size_t ptn; // for big data size > 4GB memory required
-    size_t c, i, m;
-    size_t orig_nptn = aln->size();
-    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
-    double *eval = model->getEigenvalues();
-    assert(eval);
-
-	assert(theta_all);
-	if (!theta_computed) {
-		// precompute theta for fast branch length optimization
-
-	    if (dad->isLeaf()) {
-	    	// special treatment for TIP-INTERNAL NODE case
-#ifdef _OPENMP
-#pragma omp parallel for private(ptn, i, m)
-#endif
-	    	for (ptn = 0; ptn < nptn; ptn++) {
-				double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
-				double *theta = theta_all + ptn*block;
-				double *lh_tip = tip_partial_lh +
-						((int)((ptn < orig_nptn) ? (aln->at(ptn))[dad->id] :  model_factory->unobserved_ptns[ptn-orig_nptn]))*statemix;
-				for (m = 0; m < nmixture; m++) {
-					for (i = 0; i < statecat; i++) {
-						theta[m*statecat+i] = lh_tip[m*nstates + i%nstates] * partial_lh_dad[m*statecat+i];
-					}
-				}
-
-			}
-			// ascertainment bias correction
-	    } else {
-	    	// both dad and node are internal nodes
-		    double *partial_lh_node = node_branch->partial_lh;
-		    double *partial_lh_dad = dad_branch->partial_lh;
-
-	    	size_t all_entries = nptn*block;
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-	    	for (i = 0; i < all_entries; i++) {
-				theta_all[i] = partial_lh_node[i] * partial_lh_dad[i];
-			}
-	    }
-		theta_computed = true;
-	}
-
-    double *val0 = new double[block];
-    double *val1 = new double[block];
-    double *val2 = new double[block];
-	for (c = 0; c < ncat; c++) {
-		double prop = site_rate->getProp(c);
-		for (m = 0; m < nmixture; m++) {
-			for (i = 0; i < nstates; i++) {
-				double cof = eval[m*nstates+i]*site_rate->getRate(c);
-				double val = exp(cof*dad_branch->length) * prop * ((ModelMixture*)model)->prop[m];
-				double val1_ = cof*val;
-				val0[(m*ncat+c)*nstates+i] = val;
-				val1[(m*ncat+c)*nstates+i] = val1_;
-				val2[(m*ncat+c)*nstates+i] = cof*val1_;
-			}
-		}
-	}
-
-
-    double my_df = 0.0, my_ddf = 0.0, prob_const = 0.0, df_const = 0.0, ddf_const = 0.0;
-
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+: my_df, my_ddf, prob_const, df_const, ddf_const) private(ptn, i)
-#endif
-    for (ptn = 0; ptn < nptn; ptn++) {
-		double lh_ptn = ptn_invar[ptn], df_ptn = 0.0, ddf_ptn = 0.0;
-		double *theta = theta_all + ptn*block;
-		for (i = 0; i < block; i++) {
-			lh_ptn += val0[i] * theta[i];
-			df_ptn += val1[i] * theta[i];
-			ddf_ptn += val2[i] * theta[i];
-		}
-
-//        assert(lh_ptn > 0.0);
-        lh_ptn = fabs(lh_ptn);
-
-        if (ptn < orig_nptn) {
-			double df_frac = df_ptn / lh_ptn;
-			double ddf_frac = ddf_ptn / lh_ptn;
-			double freq = ptn_freq[ptn];
-			double tmp1 = df_frac * freq;
-			double tmp2 = ddf_frac * freq;
-			my_df += tmp1;
-			my_ddf += tmp2 - tmp1 * df_frac;
-		} else {
-			// ascertainment bias correction
-			prob_const += lh_ptn;
-			df_const += df_ptn;
-			ddf_const += ddf_ptn;
-		}
-    }
-	df = my_df;
-	ddf = my_ddf;
-    if (isnan(df) || isinf(df)) {
-        df = 0.0;
-        ddf = 0.0;
-//        outWarning("Numerical instability (some site-likelihood = 0)");
-    }
-
-	if (orig_nptn < nptn) {
-    	// ascertainment bias correction
-    	prob_const = 1.0 - prob_const;
-    	double df_frac = df_const / prob_const;
-    	double ddf_frac = ddf_const / prob_const;
-    	int nsites = aln->getNSite();
-    	df += nsites * df_frac;
-    	ddf += nsites *(ddf_frac + df_frac*df_frac);
-    }
-
-
-    delete [] val2;
-    delete [] val1;
-    delete [] val0;
-}
-
-//template <const int nstates>
-double PhyloTree::computeMixtureLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
-    PhyloNode *node = (PhyloNode*) dad_branch->node;
-    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
-    if (!central_partial_lh)
-        initializeAllPartialLh();
-    if (node->isLeaf()) {
-    	PhyloNode *tmp_node = dad;
-    	dad = node;
-    	node = tmp_node;
-    	PhyloNeighbor *tmp_nei = dad_branch;
-    	dad_branch = node_branch;
-    	node_branch = tmp_nei;
     }
-    if ((dad_branch->partial_lh_computed & 1) == 0)
-//        computeMixturePartialLikelihoodEigen(dad_branch, dad);
-        computePartialLikelihood(dad_branch, dad);
-    if ((node_branch->partial_lh_computed & 1) == 0)
-        computePartialLikelihood(node_branch, node);
-    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
-    size_t nstates = aln->num_states;
-    size_t ncat = site_rate->getNRate();
-    size_t nmixture = model->getNMixtures();
-
-    size_t block = ncat * nstates * nmixture;
-    size_t statemix = nstates * nmixture;
-    size_t catmix = ncat * nmixture;
-    size_t ptn; // for big data size > 4GB memory required
-    size_t c, i, m;
-    size_t orig_nptn = aln->size();
-    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
-    double *eval = model->getEigenvalues();
-    assert(eval);
 
-    double *val = new double[block];
-	for (c = 0; c < ncat; c++) {
-		double len = site_rate->getRate(c)*dad_branch->length;
-		double prop = site_rate->getProp(c);
-		for (m = 0; m < nmixture; m++)
-			for (i = 0; i < nstates; i++)
-				val[(m*ncat+c)*nstates+i] = exp(eval[m*nstates+i]*len) * prop * ((ModelMixture*)model)->prop[m];
-	}
 
-	double prob_const = 0.0;
-    // 2015-11-30: _pattern_lh_cat now stores mixture and cat likelihoods
-	memset(_pattern_lh_cat, 0, nptn*catmix*sizeof(double));
+	memset(ptn_ancestral_prob, 0, sizeof(double)*nptn*nstates);
 
     if (dad->isLeaf()) {
     	// special treatment for TIP-INTERNAL NODE case
-    	double *partial_lh_node = new double[(aln->STATE_UNKNOWN+1)*block];
-    	IntVector states_dad = aln->seq_states[dad->id];
-    	states_dad.push_back(aln->STATE_UNKNOWN);
-    	// precompute information from one tip
-    	for (IntVector::iterator it = states_dad.begin(); it != states_dad.end(); it++) {
-    		double *lh_node = partial_lh_node +(*it)*block;
-    		double *lh_tip = tip_partial_lh + (*it)*statemix;
-    		double *val_tmp = val;
-			for (m = 0; m < nmixture; m++) {
-				for (c = 0; c < ncat; c++) {
-					for (i = 0; i < nstates; i++) {
-						  lh_node[i] = val_tmp[i] * lh_tip[m*nstates+i];
-					}
-					lh_node += nstates;
-					val_tmp += nstates;
-				}
-			}
-    	}
+        double partial_lh_leaf[(aln->STATE_UNKNOWN+1)*block];
+
+        for (IntVector::iterator it = aln->seq_states[dad->id].begin(); it != aln->seq_states[dad->id].end(); it++) {
+            int state = (*it);
+            for (m = 0; m < nmixture; m++) {
+                double *this_echild = &echild[m*nstatesqr*ncat];
+                double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixture + m*nstates];
+                double *this_partial_lh_leaf = &partial_lh_leaf[state*block+m*statecat];
+                for (x = 0; x < statecat; x++) {
+                    double vchild = 0.0;
+                    for (i = 0; i < nstates; i++) {
+                        vchild += this_echild[x*nstates+i] * this_tip_partial_lh[i];
+                    }
+                    this_partial_lh_leaf[x] = vchild;
+                }
+            }
+        }
+        size_t addr = aln->STATE_UNKNOWN * block;
+        for (x = 0; x < block; x++) {
+            partial_lh_leaf[addr+x] = 1.0;
+        }
+
 
     	// now do the real computation
 #ifdef _OPENMP
-#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, i, c, m)
+#pragma omp parallel for private(ptn, i, c, m, x)
 #endif
     	for (ptn = 0; ptn < nptn; ptn++) {
-			double lh_ptn = ptn_invar[ptn];
-			double *lh_cat = _pattern_lh_cat + ptn*catmix;
+            double *lh_state = ptn_ancestral_prob + ptn*nstates;
 			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
-			int state_dad = (ptn < orig_nptn) ? (aln->at(ptn))[dad->id] : model_factory->unobserved_ptns[ptn-orig_nptn];
-			double *lh_node = partial_lh_node + state_dad*block;
-			for (m = 0; m < nmixture; m++) {
+			int state_dad = (aln->at(ptn))[dad->id];
+			double *lh_leaf = partial_lh_leaf + state_dad*block;
+            for (m = 0; m < nmixture; m++) {
+                double *this_inv_evec = inv_evec + (m*nstatesqr); 
 				for (c = 0; c < ncat; c++) {
-					for (i = 0; i < nstates; i++) {
-						*lh_cat += lh_node[i] * partial_lh_dad[i];
+					// compute real partial likelihood vector
+					for (x = 0; x < nstates; x++) {
+						double vnode = 0.0;
+						for (i = 0; i < nstates; i++) {
+							vnode += this_inv_evec[i*nstates+x] * partial_lh_dad[i];
+						}
+						lh_state[x] += lh_leaf[x] * vnode;
 					}
-					lh_node += nstates;
-					partial_lh_dad += nstates;
-                    lh_ptn += *lh_cat;
-					lh_cat++;
-				}
-                
-			}
-//			assert(lh_ptn > 0.0);
-			if (ptn < orig_nptn) {
-				lh_ptn = log(fabs(lh_ptn));
-				_pattern_lh[ptn] = lh_ptn;
-				tree_lh += lh_ptn * ptn_freq[ptn];
-			} else {
-                // bugfix 2016-01-21, prob_const can be rescaled
-                if (dad_branch->scale_num[ptn] >= 1)
-                    lh_ptn *= SCALING_THRESHOLD;
-				prob_const += lh_ptn;
-			}
+                    lh_leaf += nstates;
+                    partial_lh_dad += nstates;
+                }
+            }
+            
+            double lh_sum = lh_state[0];
+            for (x = 1; x < nstates; x++)
+                lh_sum += lh_state[x];
+            lh_sum = 1.0/lh_sum;
+            for (x = 0; x < nstates; x++)
+                lh_state[x] *= lh_sum;
 		}
-		delete [] partial_lh_node;
     } else {
     	// both dad and node are internal nodes
 #ifdef _OPENMP
-#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, i, c, m)
+#pragma omp parallel for private(ptn, i, c, m, x)
 #endif
     	for (ptn = 0; ptn < nptn; ptn++) {
-			double lh_ptn = ptn_invar[ptn];
-			double *lh_cat = _pattern_lh_cat + ptn*catmix;
+            double *lh_state = ptn_ancestral_prob + ptn*nstates;
 			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
 			double *partial_lh_node = node_branch->partial_lh + ptn*block;
-			double *val_tmp = val;
+
 			for (m = 0; m < nmixture; m++) {
+                double *this_inv_evec = inv_evec + (m*nstatesqr); 
 				for (c = 0; c < ncat; c++) {
-					for (i = 0; i < nstates; i++) {
-						*lh_cat +=  val_tmp[i] * partial_lh_node[i] * partial_lh_dad[i];
+					// compute real partial likelihood vector
+					for (x = 0; x < nstates; x++) {
+						double vdad = 0.0, vnode = 0.0;
+						size_t addr = (m*ncat+c)*nstatesqr+x*nstates;
+						for (i = 0; i < nstates; i++) {
+							vdad += echild[addr+i] * partial_lh_node[m*statecat+c*nstates+i];
+                            vnode += this_inv_evec[i*nstates+x] * partial_lh_dad[m*statecat+c*nstates+i];
+						}
+						lh_state[x] += vnode*vdad;
 					}
-					lh_ptn += *lh_cat;
-					partial_lh_node += nstates;
-					partial_lh_dad += nstates;
-					val_tmp += nstates;
-					lh_cat++;
 				}
 			}
 
-			assert(lh_ptn > 0.0);
-            if (ptn < orig_nptn) {
-				lh_ptn = log(lh_ptn);
-				_pattern_lh[ptn] = lh_ptn;
-				tree_lh += lh_ptn * ptn_freq[ptn];
-			} else {
-                // bugfix 2016-01-21, prob_const can be rescaled
-                if (dad_branch->scale_num[ptn] + node_branch->scale_num[ptn] >= 1)
-                    lh_ptn *= SCALING_THRESHOLD;
-				prob_const += lh_ptn;
-			}
+            double lh_sum = lh_state[0];
+            for (x = 1; x < nstates; x++)
+                lh_sum += lh_state[x];
+            lh_sum = 1.0/lh_sum;
+            for (x = 0; x < nstates; x++)
+                lh_state[x] *= lh_sum;
+
 		}
     }
+}
 
+void PhyloTree::computeJointAncestralSequences(int *ancestral_seqs) {
 
-    if (orig_nptn < nptn) {
-    	// ascertainment bias correction
-    	prob_const = log(1.0 - prob_const);
-    	for (ptn = 0; ptn < orig_nptn; ptn++)
-    		_pattern_lh[ptn] -= prob_const;
-    	tree_lh -= aln->getNSite()*prob_const;
-		assert(!isnan(tree_lh) && !isinf(tree_lh));
+    // step 1-3 of the dynamic programming algorithm of Pupko et al. 2000, MBE 17:890-896
+    assert(root->isLeaf());
+    int *C = new int[(size_t)getAlnNPattern()*model->num_states*leafNum];
+    computeAncestralLikelihood((PhyloNeighbor*)root->neighbors[0], NULL, C);
+    
+    // step 4-5 of the dynamic programming algorithm of Pupko et al. 2000, MBE 17:890-896
+    computeAncestralState((PhyloNeighbor*)root->neighbors[0], NULL, C, ancestral_seqs);
+    
+    clearAllPartialLH();
+    
+    delete[] C;
+}
+
+void PhyloTree::computeAncestralLikelihood(PhyloNeighbor *dad_branch, PhyloNode *dad, int *C) {
+    PhyloNode *node = (PhyloNode*)dad_branch->node;
+    if (node->isLeaf())
+        return;
+    
+    int num_leaves = 0;
+    
+    // recursive into subtree
+    FOR_NEIGHBOR_DECLARE(node, dad, it) {
+        if ((*it)->node->isLeaf()) {
+            num_leaves++;
+        } else {
+            computeAncestralLikelihood((PhyloNeighbor*)(*it), node, C);
+        }
     }
 
-	assert(!isnan(tree_lh) && !isinf(tree_lh));
+    // TODO mem save
+    if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
+        // re-orient partial_lh
+        bool done = false;
+        FOR_NEIGHBOR_IT(node, dad, it2) {
+            PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));
+            if (backnei->partial_lh) {
+                dad_branch->partial_lh = backnei->partial_lh;
+                dad_branch->scale_num = backnei->scale_num;
+                backnei->partial_lh = NULL;
+                backnei->scale_num = NULL;
+                backnei->partial_lh_computed &= ~1; // clear bit
+                done = true;
+                break;
+            }
+        }
+        assert(done && "partial_lh is not re-oriented");
+    }
+    
+    size_t nptn = aln->getNPattern();
+    size_t ptn;
+    size_t nstates = model->num_states;
+    size_t nstatesqr = nstates*nstates;
+    size_t parent, child;
+    double *trans_mat = new double[nstatesqr];
+    double *lh_leaves = NULL;
+    if (num_leaves > 0) {
+        lh_leaves = new double[(aln->STATE_UNKNOWN+1)*nstates*num_leaves];
+    }
+    if (dad) {
+        model->computeTransMatrix(dad_branch->length, trans_mat);
+        for (parent = 0; parent < nstatesqr; parent++)
+            trans_mat[parent] = log(trans_mat[parent]);
+    } else {
+        model->getStateFrequency(trans_mat);
+        for (parent = 0; parent < nstates; parent++)
+            trans_mat[parent] = log(trans_mat[parent]);
+        for (parent = 1; parent < nstates; parent++)
+            memcpy(trans_mat+parent*nstates, trans_mat, sizeof(double)*nstates);
+    }
+    
+    // compute information buffer for leaves
+	int ambi_aa[] = {
+        4+8, // B = N or D
+        32+64, // Z = Q or E
+        512+1024 // U = I or L
+    };
+    int leafid = 0; 
+    FOR_NEIGHBOR(node, dad, it) {
+        if ((*it)->node->isLeaf()) {
+            double trans_leaf[nstatesqr];
+            model->computeTransMatrix((*it)->length, trans_leaf);
+            double *lh_leaf = lh_leaves+leafid*nstates*(aln->STATE_UNKNOWN+1);
+            
+            // assign lh_leaf for normal states
+            for (parent = 0; parent < nstates; parent++)
+                for (child = 0; child < nstates; child++)
+                    lh_leaf[child*nstates+parent] = log(trans_leaf[parent*nstates+child]);
+            
+            // for unknown state
+            double *this_lh_leaf = lh_leaf + (aln->STATE_UNKNOWN*nstates);
+            for (parent = 0; parent < nstates; parent++)
+                this_lh_leaf[parent] = 0.0;
+            
+            // special treatment for ambiguous characters
+            switch (aln->seq_type) {
+            case SEQ_DNA:
+                for (int state = 4; state < 18; state++) {
+                    this_lh_leaf = lh_leaf + (state*nstates);
+                    int cstate = state-nstates+1;
+                    for (parent = 0; parent < nstates; parent++) {
+                        double sumlh = 0.0;
+                        for (child = 0; child < nstates; child++) {
+                            if ((cstate) & (1 << child))
+                                sumlh += trans_leaf[parent*nstates+child];
+                        }
+                        this_lh_leaf[parent] = log(sumlh);
+                    }
+                }
+                break;
+            case SEQ_PROTEIN:
+                for (int state = 0; state < sizeof(ambi_aa)/sizeof(int); state++) {
+                    this_lh_leaf = lh_leaf + ((state+20)*nstates);
+                    for (parent = 0; parent < nstates; parent++) {
+                        double sumlh = 0.0;                
+                        for (child = 0; child < nstates; child++) {
+                            if (ambi_aa[state] & (1 << child))
+                                sumlh += trans_leaf[parent*nstates+child];
+                        }
+                        this_lh_leaf[parent] = log(sumlh);
+                    }
+                }
+                break;
+            default:
+                break;
+            }
+            leafid++;
+        }
+    }
 
-    delete [] val;
-    return tree_lh;
+    // initialize L_y(i) and C_y(i)
+//    memset(dad_branch->partial_lh, 0, nptn*nstates*sizeof(double));
+
+    int *C_node = C + (node->id-leafNum)*nptn*nstates;
+
+    for (ptn = 0; ptn < nptn; ptn++) {
+        double *lh_dad = dad_branch->partial_lh+ptn*nstates;
+        int *this_C_node = C_node + (ptn*nstates);
+        leafid = 0;
+        double sumlh[nstates];
+        memset(sumlh, 0, sizeof(double)*nstates);
+        FOR_NEIGHBOR(node, dad, it) {
+            PhyloNeighbor *childnei = (PhyloNeighbor*)(*it);
+            if ((*it)->node->isLeaf()) {
+                double *lh_leaf = lh_leaves+leafid*nstates*(aln->STATE_UNKNOWN+1); 
+                // external node
+                int state_child;
+                state_child = (aln->at(ptn))[(*it)->node->id];
+                double *child_lh = lh_leaf + state_child*nstates;
+                for (child = 0; child < nstates; child++)
+                    sumlh[child] += child_lh[child];
+                leafid++;
+            } else {
+                double *child_lh = childnei->partial_lh + ptn*nstates;
+                for (child = 0; child < nstates; child++)
+                    sumlh[child] += child_lh[child];
+            }
+        }
+        
+        
+        if (dad) {
+            // internal node
+            for (parent = 0; parent < nstates; parent++) {
+                lh_dad[parent] = trans_mat[parent*nstates] + sumlh[0];
+                this_C_node[parent] = 0;
+                for (child = 1; child < nstates; child++) {
+                    double lh = trans_mat[parent*nstates+child] + sumlh[child];
+                    if (lh > lh_dad[parent]) {
+                        lh_dad[parent] = lh;
+                        this_C_node[parent] = child;
+                    }
+                }
+            }
+        } else {
+            // at the root
+            lh_dad[0] = trans_mat[0] + sumlh[0];
+            this_C_node[0] = 0;
+            for (parent = 1; parent < nstates; parent++) {
+                double lh = trans_mat[parent] + sumlh[parent];
+                if (lh > lh_dad[0]) {
+                    lh_dad[0] = lh;
+                    this_C_node[0] = parent;
+                }
+            }
+        }
+    }
+    
+
+    if (lh_leaves)
+        delete[] lh_leaves;
+    delete[] trans_mat;
+}
+
+
+void PhyloTree::computeAncestralState(PhyloNeighbor *dad_branch, PhyloNode *dad, int *C, int *ancestral_seqs) {
+    PhyloNode *node = (PhyloNode*)dad_branch->node;
+    if (node->isLeaf())
+        return;
+
+    size_t nptn = aln->getNPattern();
+    size_t ptn;
+    size_t nstates = model->num_states;
+
+    int *C_node = C + (node->id-leafNum)*nptn*nstates;
+    int *ancestral_seqs_node = ancestral_seqs + (node->id-leafNum)*nptn; 
+    if (dad) {
+        // at an internal node
+        int *ancestral_seqs_dad = ancestral_seqs + (dad->id-leafNum)*nptn;
+        for (ptn = 0; ptn < nptn; ptn++)
+            ancestral_seqs_node[ptn] = C_node[ptn*nstates+ancestral_seqs_dad[ptn]];
+        
+    } else {
+        // at the root
+        for (ptn = 0; ptn < nptn; ptn++)
+            ancestral_seqs_node[ptn] = C_node[ptn*nstates];
+    }
+    FOR_NEIGHBOR_IT(node, dad, it)
+        computeAncestralState((PhyloNeighbor*)(*it), node, C, ancestral_seqs);
 }
+
+
+
diff --git a/pllnni.cpp b/pllnni.cpp
old mode 100755
new mode 100644
index 9c71a91..550a9c8
--- a/pllnni.cpp
+++ b/pllnni.cpp
@@ -1,3 +1,24 @@
+/***************************************************************************
+ *   Copyright (C) 2014 by                                            *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -21,7 +42,7 @@ extern VerboseMode verbose_mode;
 int NNI_MAX_NR_STEP = 10;
 
 /* program options */
-extern Params *globalParam;
+extern Params *globalParams;
 extern Alignment *globalAlignment;
 
 /**
@@ -244,8 +265,8 @@ set<int> getAffectedNodes(pllInstance* tr, nodeptr p) {
 
 void pllEvalAllNNIs(pllInstance *tr, partitionList *pr, SearchInfo &searchinfo) {
     /* DTH: mimic IQTREE::optimizeNNI 's first call to IQTREE::saveCurrentTree */
-    if((globalParam->online_bootstrap == PLL_TRUE) &&
-            (globalParam->gbo_replicates > 0)){
+    if((globalParams->online_bootstrap == PLL_TRUE) &&
+            (globalParams->gbo_replicates > 0)){
         tr->fastScaling = PLL_FALSE;
         pllEvaluateLikelihood(tr, pr, tr->start, PLL_FALSE, PLL_TRUE);
         pllSaveCurrentTree(tr, pr, tr->start);
@@ -294,7 +315,7 @@ double pllDoNNISearch(pllInstance* tr, partitionList *pr, SearchInfo &searchinfo
 	/* evaluate NNIs */
 	pllEvalAllNNIs(tr, pr, searchinfo);
 
-	if (searchinfo.speednni) {
+	if (globalParams->speednni) {
 		searchinfo.aBranches.clear();
 	}
 
@@ -322,7 +343,7 @@ double pllDoNNISearch(pllInstance* tr, partitionList *pr, SearchInfo &searchinfo
 		for (vector<pllNNIMove>::iterator it = selectedNNIs.begin(); it != selectedNNIs.end(); it++) {
 			/* do the topological change */
 			doOneNNI(tr, pr, (*it).p, (*it).nniType, TOPO_ONLY);
-			if (searchinfo.speednni) {
+			if (globalParams->speednni) {
 				vector<string> aBranches = getAffectedBranches(tr, (*it).p);
 				searchinfo.aBranches.insert(aBranches.begin(), aBranches.end());
 			}
@@ -338,7 +359,7 @@ double pllDoNNISearch(pllInstance* tr, partitionList *pr, SearchInfo &searchinfo
 		if (selectedNNIs.size() != 0) {
 			//pllEvaluateLikelihood(tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
 			pllOptimizeBranchLengths(tr, pr, 1);
-			if (globalParam->count_trees) {
+			if (globalParams->count_trees) {
 	            countDistinctTrees(tr, pr);
 			}
 			int numNNI = selectedNNIs.size();
@@ -504,7 +525,7 @@ double doOneNNI(pllInstance *tr, partitionList *pr, nodeptr p, int swap, NNI_Typ
     }
     // Optimize the central branch
     pllOptimizeOneBranch(tr, pr, p);
-    if((globalParam->online_bootstrap == PLL_TRUE) && (globalParam->gbo_replicates > 0)){
+    if((globalParams->online_bootstrap == PLL_TRUE) && (globalParams->gbo_replicates > 0)){
         tr->fastScaling = PLL_FALSE;
         pllEvaluateLikelihood(tr, pr, p, PLL_FALSE, PLL_TRUE); // DTH: modified the last arg
         pllSaveCurrentTree(tr, pr, p);
@@ -571,8 +592,8 @@ double doOneNNI(pllInstance *tr, partitionList *pr, nodeptr p, int swap, NNI_Typ
         else
             pllUpdatePartials(tr, pr, r, PLL_FALSE);
         pllOptimizeOneBranch(tr, pr, r);
-        if((globalParam->online_bootstrap == PLL_TRUE) &&
-                        (globalParam->gbo_replicates > 0)){
+        if((globalParams->online_bootstrap == PLL_TRUE) &&
+                        (globalParams->gbo_replicates > 0)){
             tr->fastScaling = PLL_FALSE;
             pllEvaluateLikelihood(tr, pr, r, PLL_FALSE, PLL_TRUE); // DTH: modified the last arg
             pllSaveCurrentTree(tr, pr, r);
@@ -674,7 +695,7 @@ int evalNNIForBran(pllInstance* tr, partitionList *pr, nodeptr p, SearchInfo &se
 
 	/* do an NNI move of type 1 */
 	double lh1 = doOneNNI(tr, pr, p, 0, searchinfo.nni_type, &searchinfo);
-	if (globalParam->count_trees)
+	if (globalParams->count_trees)
 		countDistinctTrees(tr, pr);
 	pllNNIMove nni1;
 	nni1.p = p;
@@ -709,7 +730,7 @@ int evalNNIForBran(pllInstance* tr, partitionList *pr, nodeptr p, SearchInfo &se
 
 	/* do an NNI move of type 2 */
 	double lh2 = doOneNNI(tr, pr, p, 1, searchinfo.nni_type, &searchinfo);
-	if (globalParam->count_trees)
+	if (globalParams->count_trees)
 		countDistinctTrees(tr, pr);
 
 	// Create the nniMove struct to store this move
@@ -789,7 +810,7 @@ bool isAffectedBranch(nodeptr p, SearchInfo &searchinfo) {
 
 void evalNNIForSubtree(pllInstance* tr, partitionList *pr, nodeptr p, SearchInfo &searchinfo) {
 	if (!isTip(p->number, tr->mxtips) && !isTip(p->back->number, tr->mxtips)) {
-		if (searchinfo.speednni && searchinfo.curNumNNISteps != 1) {
+		if (globalParams->speednni && searchinfo.curNumNNISteps != 1) {
 			if (isAffectedBranch(p, searchinfo)) {
 				evalNNIForBran(tr, pr, p, searchinfo);
 			}
@@ -906,14 +927,14 @@ void pllSaveCurrentTree(pllInstance* tr, partitionList *pr, nodeptr p){
         // online bootstrap
         int nptn = pllUFBootDataPtr->n_patterns;
         int updated = 0;
-        int nsamples = globalParam->gbo_replicates;
+        int nsamples = globalParams->gbo_replicates;
         for (int sample = 0; sample < nsamples; sample++) {
             double rell = 0.0;
             for (int ptn = 0; ptn < nptn; ptn++)
                 rell += pattern_lh[ptn] * pllUFBootDataPtr->boot_samples[sample][ptn];
 
-            if (rell > pllUFBootDataPtr->boot_logl[sample] + globalParam->ufboot_epsilon ||
-                (rell > pllUFBootDataPtr->boot_logl[sample] - globalParam->ufboot_epsilon &&
+            if (rell > pllUFBootDataPtr->boot_logl[sample] + globalParams->ufboot_epsilon ||
+                (rell > pllUFBootDataPtr->boot_logl[sample] - globalParams->ufboot_epsilon &&
                     random_double() <= 1.0/(pllUFBootDataPtr->boot_counts[sample]+1))) {
 //                if (!globalParam->store_candidate_trees)
                 {
@@ -928,7 +949,7 @@ void pllSaveCurrentTree(pllInstance* tr, partitionList *pr, nodeptr p){
                     }
                 }
                 if (rell <= pllUFBootDataPtr->boot_logl[sample] +
-                        globalParam->ufboot_epsilon) {
+                        globalParams->ufboot_epsilon) {
                     pllUFBootDataPtr->boot_counts[sample]++;
                 } else {
                     pllUFBootDataPtr->boot_counts[sample] = 1;
diff --git a/pllnni.h b/pllnni.h
index a7a0f62..cd660a3 100644
--- a/pllnni.h
+++ b/pllnni.h
@@ -1,3 +1,24 @@
+/***************************************************************************
+ *   Copyright (C) 2014 by                                            *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
 #ifndef NNISEARCH_H
 #define NNISEARCH_H
 
@@ -31,7 +52,6 @@ typedef struct {
 
 typedef struct {
     // FOR GENERAL TREE SEARCH
-	bool speednni;
 	vector<pllNNIMove> posNNIList; // positive NNIs
 	unordered_set<string> aBranches; // Set of branches that are affected by the previous NNIs
 	double curLogl; // Current tree log-likelihood
diff --git a/quartet.cpp b/quartet.cpp
index ca2dc5d..e21007d 100644
--- a/quartet.cpp
+++ b/quartet.cpp
@@ -921,7 +921,7 @@ void PhyloTree::computeQuartetLikelihoods(vector<QuartetInfo> &lmap_quartet_info
             // set up parameters
             quartet_tree->setParams(params);
             quartet_tree->optimize_by_newton = params->optimize_by_newton;
-            quartet_tree->setLikelihoodKernel(params->SSE);
+            quartet_tree->setLikelihoodKernel(params->SSE, num_threads);
 
             // set up partition model
             if (isSuperTree()) {
diff --git a/split.cpp b/split.cpp
index 74a1506..62495d3 100644
--- a/split.cpp
+++ b/split.cpp
@@ -1,6 +1,8 @@
 /***************************************************************************
- *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
- *   minh.bui at univie.ac.at   *
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
diff --git a/split.h b/split.h
index 640ac82..4a4343b 100644
--- a/split.h
+++ b/split.h
@@ -1,6 +1,8 @@
 /***************************************************************************
- *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
- *   minh.bui at univie.ac.at   *
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -272,6 +274,7 @@ public:
 	Split *extractSubSplit(Split &taxa_mask);
 
 	string &getName() { return name; }
+
 protected:
 	/**
 		number of taxa
diff --git a/splitgraph.cpp b/splitgraph.cpp
index 3479c14..4737268 100644
--- a/splitgraph.cpp
+++ b/splitgraph.cpp
@@ -639,7 +639,7 @@ void SplitGraph::scaleWeight(double norm, bool make_int, int precision) {
 		else 
 			(*itg)->setWeight( round((*itg)->getWeight()*norm*pow((double)10.0,precision))/pow((double)10.0,precision));
 }
-
+// TODO Implement a more efficient function using Hash Table
 bool SplitGraph::containSplit(Split &sp) {
 	Split invert_sp(sp);
 	invert_sp.invert();
diff --git a/splitgraph.h b/splitgraph.h
index f7e5dd8..1960fd2 100644
--- a/splitgraph.h
+++ b/splitgraph.h
@@ -383,7 +383,7 @@ public:
 	 * @return number of trivial splits removed
 	*/
 	int removeTrivialSplits();
-
+    
 protected:
 
 	/**
diff --git a/splitset.cpp b/splitset.cpp
index 555b1a1..5e4a8e0 100644
--- a/splitset.cpp
+++ b/splitset.cpp
@@ -1,6 +1,8 @@
 /***************************************************************************
- *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
- *   minh.bui at univie.ac.at   *
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
diff --git a/splitset.h b/splitset.h
index 890337d..bab520f 100644
--- a/splitset.h
+++ b/splitset.h
@@ -1,6 +1,8 @@
 /***************************************************************************
- *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
- *   minh.bui at univie.ac.at   *
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
diff --git a/stoprule.cpp b/stoprule.cpp
index a5018bf..8a5e4a1 100644
--- a/stoprule.cpp
+++ b/stoprule.cpp
@@ -19,6 +19,7 @@
  ***************************************************************************/
 #include "stoprule.h"
 #include "timeutil.h"
+#include "MPIHelper.h"
 
 StopRule::StopRule() : CheckpointFactory()
 {
@@ -35,6 +36,7 @@ StopRule::StopRule() : CheckpointFactory()
 	start_real_time = -1.0;
 	max_run_time = -1.0;
 	curIteration = 0;
+    should_stop = false;
 }
 
 void StopRule::initialize(Params &params) {
@@ -49,6 +51,14 @@ void StopRule::initialize(Params &params) {
 	max_run_time = params.maxtime * 60; // maxtime is in minutes
 }
 
+void StopRule::getUFBootCountCheck(int &ufboot_count, int &ufboot_count_check) {  // outputs: the next two iteration thresholds derived from curIteration
+    int step = step_iteration;
+    while (step*2 < MPIHelper::getInstance().getNumProcesses())  // keep doubling until step*2 is at least the process count reported by MPIHelper
+        step *= 2;
+    ufboot_count = (curIteration/(step/2)+1)*(step/2);  // smallest multiple of step/2 above curIteration (assumes non-negative int); NOTE(review): step/2 is 0 if step < 2 -- confirm step_iteration >= 2
+    ufboot_count_check = (curIteration/step+1)*step;  // smallest multiple of step above curIteration (same assumption)
+}
+
 StopRule::~StopRule()
 {
 }
@@ -96,32 +106,34 @@ void StopRule::restoreCheckpoint() {
 //		return ((cur_iteration+step_iteration-1)/step_iteration)*step_iteration;
 //	case SC_REAL_TIME:
 ////		return ((max_run_time - realtime_secs)/max_run_time);
-//		assert(0); // TODO
+//		assert(0);
 //		return 0;
 //	}
 //}
 
 bool StopRule::meetStopCondition(int cur_iteration, double cur_correlation) {
+    if (should_stop)
+        return true;
 	switch (stop_condition) {
-	case SC_FIXED_ITERATION:
-		return cur_iteration > min_iteration;
-	case SC_WEIBULL:
-		if (predicted_iteration == 0)
-			return cur_iteration > min_iteration;
-		else
-			return cur_iteration > predicted_iteration;
-	case SC_UNSUCCESS_ITERATION:
-		return cur_iteration >= getLastImprovedIteration() + unsuccess_iteration;
-	case SC_BOOTSTRAP_CORRELATION:
-		return ((cur_correlation >= min_correlation) && (cur_iteration >= getLastImprovedIteration() + unsuccess_iteration))
-				|| cur_iteration > max_iteration;
-	case SC_REAL_TIME:
-		return (getRealTime() - start_real_time >= max_run_time);
+		case SC_FIXED_ITERATION:
+			return cur_iteration >= min_iteration;
+		case SC_WEIBULL:
+			if (predicted_iteration == 0)
+				return cur_iteration > min_iteration;
+			else
+				return cur_iteration > predicted_iteration;
+		case SC_UNSUCCESS_ITERATION:
+			return cur_iteration > getLastImprovedIteration() + unsuccess_iteration;
+		case SC_BOOTSTRAP_CORRELATION:
+			return ((cur_correlation >= min_correlation) && (cur_iteration > getLastImprovedIteration() + unsuccess_iteration))
+				   || cur_iteration > max_iteration;
+		case SC_REAL_TIME:
+			return (getRealTime() - start_real_time >= max_run_time);
 	}
 	return false;
 }
 
-double StopRule::getRemainingTime(int cur_iteration, double cur_correlation) {
+double StopRule::getRemainingTime(int cur_iteration) {
 	double realtime_secs = getRealTime() - start_real_time;
 	int niterations;
 	switch (stop_condition) {
diff --git a/stoprule.h b/stoprule.h
index 972a714..7cb9ea5 100644
--- a/stoprule.h
+++ b/stoprule.h
@@ -42,6 +42,8 @@ public:
 	*/
     ~StopRule();
 
+    void getUFBootCountCheck(int &ufboot_count, int &ufboot_count_check);
+
     /**
         save object into the checkpoint
     */
@@ -77,9 +79,17 @@ public:
 		@return TRUE if stop condition is met, FALSE otherwise
 	*/
 	bool meetStopCondition(int cur_iteration, double cur_correlation);
+
+    /**
+        return TRUE if cur_correlation is high enough
+        @param cur_correlation correlation coefficient
+    */
+    bool meetCorrelation(double cur_correlation) {
+        return cur_correlation >= min_correlation;
+    }
 	
 	/** get the remaining time to converge, in seconds */
-	double getRemainingTime(int cur_iteration, double cur_correlation);
+	double getRemainingTime(int cur_iteration);
 
 	/**
 		@return the number of iterations required to stop the search
@@ -100,6 +110,10 @@ public:
         StopRule::curIteration = curIteration;
     }
 
+    void shouldStop() {
+        should_stop = true;
+    }
+
 private:
 
     /**
@@ -149,6 +163,9 @@ private:
     /** starting real time of the program */
     double start_real_time;
 
+    /** TRUE to override stop condition */
+    bool should_stop;
+
 	/* FOLLOWING CODES ARE FROM IQPNNI version 3 */	
 
 //	int nTime_;
diff --git a/superalignment.cpp b/superalignment.cpp
index 7311455..8f56c40 100644
--- a/superalignment.cpp
+++ b/superalignment.cpp
@@ -23,15 +23,26 @@
 #include "superalignment.h"
 #include "phylosupertree.h"
 
-SuperAlignment::SuperAlignment()
- : Alignment() {}
+SuperAlignment::SuperAlignment() : Alignment() {
+    max_num_states = 0;
+}
 
-SuperAlignment::SuperAlignment(PhyloSuperTree *super_tree)
- : Alignment()
+SuperAlignment::SuperAlignment(PhyloSuperTree *super_tree) : Alignment()
 {
+    max_num_states = 0;
 	// first build taxa_index and partitions
 	int site, seq, nsite = super_tree->size();
 	PhyloSuperTree::iterator it;
+
+    // BUG FIX 2016-11-29: when merging partitions with -m TESTMERGE, sequence order is changed
+    // get the taxa names from existing tree
+    if (super_tree->root) {
+        super_tree->getTaxaName(seq_names);
+        taxa_index.resize(seq_names.size());
+        for (auto i = taxa_index.begin(); i != taxa_index.end(); i++)
+            i->resize(nsite, -1);
+    }
+        
 	for (site = 0, it = super_tree->begin(); it != super_tree->end(); it++, site++) {
 		partitions.push_back((*it)->aln);
 		int nseq = (*it)->aln->getNSeq();
@@ -599,3 +610,50 @@ Alignment *SuperAlignment::concatenateAlignments(IntVector &ids) {
 
 	return aln;
 }
+
+void SuperAlignment::countConstSite() {  // recompute the aggregate site statistics from all partitions
+    num_informative_sites = 0;
+    max_num_states = 0;
+    frac_const_sites = 0;
+    frac_invariant_sites = 0;
+    size_t nsites = 0;  // total number of sites summed over all partitions
+    for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
+        (*it)->countConstSite();  // refresh the partition's own statistics before reading them
+        num_informative_sites += (*it)->num_informative_sites;
+        if ((*it)->num_states > max_num_states)
+            max_num_states = (*it)->num_states;  // track the largest state count of any partition
+        nsites += (*it)->getNSite();
+        frac_const_sites += (*it)->frac_const_sites * (*it)->getNSite();  // weight each fraction by the partition's site count
+        frac_invariant_sites += (*it)->frac_invariant_sites * (*it)->getNSite();
+    }
+    frac_const_sites /= nsites;  // turn the weighted sums into site-weighted averages; NOTE(review): nsites is 0 when partitions is empty
+    frac_invariant_sites /= nsites;
+}
+
+void SuperAlignment::orderPatternByNumChars() {  // rebuild ordered_pattern from the ordered patterns of every partition
+    const int UINT_BITS = sizeof(UINT)*8;
+    int maxi = (num_informative_sites+UINT_BITS-1)/UINT_BITS;  // ceil(num_informative_sites / UINT_BITS)
+    pars_lower_bound = new UINT[maxi+1];  // NOTE(review): the previous pointer value is overwritten without being freed here -- confirm ownership
+    memset(pars_lower_bound, 0, (maxi+1)*sizeof(UINT));
+    int part, nseq = getNSeq(), npart = partitions.size();
+    
+    // compute ordered_pattern
+    ordered_pattern.clear();
+    vector<UINT> sum_scores(npart);  // std::vector instead of a variable-length array, which is not standard C++
+    for (part = 0; part < npart; part++) {  // compare against the int count to avoid a signed/unsigned comparison
+        partitions[part]->orderPatternByNumChars();
+        // widen every partition pattern to one entry per sequence of the super-alignment
+        for (vector<Pattern>::iterator pit = partitions[part]->ordered_pattern.begin(); pit != partitions[part]->ordered_pattern.end(); pit++) {
+            Pattern pattern(*pit);
+            pattern.resize(nseq); // maximal unknown states
+            for (int j = 0; j < nseq; j++)
+                if (taxa_index[j][part] >= 0)
+                    pattern[j] = (*pit)[taxa_index[j][part]];  // copy the state from the partition-local sequence index
+                else
+                    pattern[j] = partitions[part]->STATE_UNKNOWN;  // negative index: presumably the taxon is absent from this partition
+            ordered_pattern.push_back(pattern);
+        }
+        sum_scores[part] = partitions[part]->pars_lower_bound[0];  // stored for the TODO below; not read anywhere yet
+    }
+    // TODO compute pars_lower_bound (lower bound of pars score for remaining patterns)
+}
diff --git a/superalignment.h b/superalignment.h
index 583dd0a..ba47436 100644
--- a/superalignment.h
+++ b/superalignment.h
@@ -215,6 +215,22 @@ public:
 	 */
 	void buildPattern();
 
+    /**
+            count the fraction of constant sites in the alignment, update the variable frac_const_sites
+     */
+    virtual void countConstSite();
+
+    /**
+     * 	@return number of states, if it is a partition model, return max num_states across all partitions
+     */
+    virtual int getMaxNumStates() {
+    	return max_num_states;
+    }
+
+    /** order pattern by number of character states and return in ptn_order
+    */
+    virtual void orderPatternByNumChars();
+
 	/**
 		actual partition alignments
 	*/
@@ -225,6 +241,9 @@ public:
 	*/
 	vector<IntVector> taxa_index;
 
+	/** maximum number of states across all partitions */
+	int max_num_states;
+
 	/**
 	 * concatenate subset of alignments
 	 * @param ids IDs of sub-alignments
diff --git a/test_scripts/README b/test_scripts/README
index f896292..890d622 100644
--- a/test_scripts/README
+++ b/test_scripts/README
@@ -1,19 +1,21 @@
-1. Complile your local branch: 
-    ./compile.sh <your_branch>
+1. Compile your local branch:
+    ./compile.sh <branch_name> [<iqtree_flags>]
     EXAMPLE: ./compile.sh master
-You might also want to 'pull' code from the remote server to update your branch before performing the compilation. The binary of your branch will be stored in 'iqtree_binaries' directory. A binary of the most recent IQ-TREE release will also be compiled and stored in the folder.
+You might also want to 'pull' code from the remote server to update your branch before performing the compilation.
+The binary of your branch will be stored in 'iqtree_binaries' directory.
 
-2. If you want to run the standard tests, use the gen_test_standrd.py script (running the script without any option output the help menu) as follows: 
+2. Prepare the config_file (see test_configs.txt for an example)
+
+3. If you want to run the standard tests, use the gen_test_standard.py script (running the script without any option prints the help menu) as follows:
     ./gen_test_standard.py -b <path_to_your_iqtree_binary> -c <config_file>
     EXAMPLE: ./gen_test_standard.py -b iqtree_binaries/iqtree_master -c test_configs.txt
-A text file named '<your_binary_name>_test_standard_cmds.txt' containing all the test commnds will be generated. Copy all the content of the test_script folder to libby. Submit the job with the following commands: 
+A text file named '<your_binary_name>_test_standard_cmds.txt' containing all the test commands will be generated. Copy all the content of the test_script folder to libby. Submit the job with the following commands:
     ./submit_jobs.sh <number_of_threads> <cmd_file> <aln_dir> <out_dir> <binary_dir>
-    EXAMPLE: ./submit_jobs.sh 16 iqtree_master_test_standard_cmds.txt test_alignments iqtree_master_test_standard iqtree_binaries 
-The LOG FILE containing the status of all jobs are writen in <out_dir>/<cmd_file>[0-9]*.log. Look into the file to see whether all jobs have run successfully. Grep for "ERROR" to see which job contains BUG.  
+    EXAMPLE: ./submit_jobs.sh 16 iqtree_master_test_standard_cmds.txt test_alignments iqtree_master_test_standard iqtree_binaries
+The LOG FILE containing the status of all jobs is written to <out_dir>/<cmd_file>[0-9]*.log. Look into the file to see whether all jobs have run successfully. Grep for "ERROR" to see which job contains a bug.
 
-3. If you want to test all the commands by users of the web server that caused bugs: (./gen_test_standard.py -h for help)
+4. If you want to test all the commands by users of the web server that caused bugs: (./gen_test_standard.py -h for help)
     ./gen_test_standard.py -b <path_to_iqtree_binary>
     EXAMPLE: ./gen_test_standard.py -b iqtree_binaries/iqtree_master
-The above command creates a folder called 'webserver_alignments' that contains all the user alignments. The next steps are the same as described in 2. 
+The above command creates a folder called 'webserver_alignments' that contains all the user alignments. The next steps are the same as described in step 3.
     EXAMPLE: ./submit_jobs.sh 40 iqtree_master_test_webserver_cmds.txt webserver_alignments iqtree_master_test_webserver iqtree_binaries
-
diff --git a/test_scripts/compile.sh b/test_scripts/compile.sh
index b532b0a..6f9287a 100755
--- a/test_scripts/compile.sh
+++ b/test_scripts/compile.sh
@@ -1,19 +1,18 @@
-#!/bin/bash - 
+#!/bin/bash -
 #===============================================================================
 #
-#          FILE: compile_binary.sh
-# 
-#         USAGE: ./compile_binary.sh 
-# 
-#   DESCRIPTION: This script checkouts the last release version of IQ-TREE and the HEAD of
-#                the current branch. Then it complile both version
-# 
+#          FILE: compile.sh
+#
+#         USAGE: ./compile.sh
+#
+#   DESCRIPTION: This script checkouts and compile the specified branch of IQ-TREE
+#
 #       OPTIONS: ---
 #  REQUIREMENTS: ---
 #          BUGS: ---
 #         NOTES: ---
-#        AUTHOR: Tung Nguyen (nltung at gmail.com) 
-#  ORGANIZATION: 
+#        AUTHOR: Tung Nguyen (nltung at gmail.com)
+#  ORGANIZATION:
 #       CREATED: 2015-01-26 13:02:57 CET
 #      REVISION:  ---
 #===============================================================================
@@ -51,70 +50,78 @@ require_clean_work_tree () {
 #Check whether the git work tree is clean
 #require_clean_work_tree
 
-if [ "$#" != 1 ]
+if [ "$#" -lt 1 ]
 then
-  echo "Please enter the name of the local branch you want to compile"
-  echo "USAGE: $0 <branch_name>" >&2
-  exit 1
+    echo "Please enter the name of the local branch you want to compile"
+    echo "USAGE: $0 <branch_name> [<iqtree_flags>]" >&2
+    exit 1
 fi
 
 
 #Determine hash code of current branch
 #branch=`git status | grep "On branch" | awk '{print $3}'`
 branch=$1
+flags=$2
+flagOMP="${flags} omp" # flags used to compile OpenMP version of IQ-TREE
+echo "COMPILING BRANCH ${branch} USING FLAGS ${flags}"
 #Take the first 6 characters of the current head commit
 commit_cur=`git log | head -n1 | awk '{print $2}' | cut -c 1-6`
 
-#Dictionary and binary names
-cur_build="build_${branch}"
-release_build="build_release"
-release_binary_prefix="iqtree_release"
-#cur_binary="iqtree_${commit_cur}"
-cur_binary="iqtree_${branch}"
-bin_dir="iqtree_binaries"
+#Assign names to build and binary directories
+flagSuffix=`echo ${flags} | sed 's/ /-/g'`
+buildDir="build-${branch}-${flagSuffix}"
+buildDirOMP="build-${branch}-${flagSuffix}-omp"
+binaryName="iqtree-${branch}"
+binaryNameOMP="${binaryName}-omp"
+binDir="iqtree-${branch}-bin"
 
-#Clean up
-if [ -e $cur_build ]
+#Create the build directory
+if [[ ! -e $buildDir ]]
 then
-  rm -rf $cur_build
+  mkdir $buildDir
 fi
-if [ -e $release_build ]
+if [[ ! -e $buildDirOMP ]]
 then
-  rm -rf $release_build 
+  mkdir $buildDirOMP
 fi
-if [ -e $bin_dir ]
+
+#Create binary directory
+if [[ ! -e $binDir ]]
 then
-  rm -rf $bin_dir
+    mkdir $binDir
 fi
-mkdir $bin_dir
-mkdir $cur_build 
+
 #Fetch changes from server
 git fetch
 curBranch=`git status | grep 'On branch' | awk '{print $3}'`
-if [ ${curBranch} != ${branch} ]
+if [[ ${curBranch} != ${branch} ]]
 then
-  git stash
-  git checkout $branch
-  git pull
-  git submodule update
+    echo "Switch to branch ${branch} and pull code from the server ... "
+    git stash
+    echo "Current changes stashed."
+    git checkout $branch
+    git pull
+    #git submodule update
 fi
-cmake -B${cur_build} -H..
-make -C ${cur_build} -j4
-cp ${cur_build}/iqtree ${bin_dir}/${cur_binary} 
-#rm -rf ${cur_build}
-mkdir $release_build
-#Find the hash code of the most recent release in master
-commit=`git log origin/master | grep -m 1 -B 4 "release version" | grep "commit" | awk '{print $2}'`
-version=`git log origin/master | grep -m 1 "release version [0-9]*" | awk '{print $3}'`
-git checkout ${commit}
-git submodule update
-cmake -B${release_build} -H..
-make -C ${release_build} -j4
-cp ${release_build}/iqtree ${bin_dir}/${release_binary_prefix}_${version}
-git checkout ${curBranch}
-git stash apply
-git submodule update
+
+#Build the selected
+
+echo -e "\nGENERATING MAKEFILE FOR SEQUENTIAL VERSION OF IQ-TREE FOR BRANCH ${branch}\n"
+cmake -B${buildDir} -H.. -DIQTREE_FLAGS="${flags}"
+echo -e "\nBUILDING SEQUENTIAL VERSION OF IQ-TREE FOR BRANCH ${branch}\n"
+make -C ${buildDir} -j4
+
+echo -e "\nGENERATING MAKEFILE FOR OPENMP VERSION OF IQ-TREE FOR BRANCH ${branch}\n"
+echo ${flagOMP}
+echo ${buildDirOMP}
+cmake -B${buildDirOMP} -H.. -DIQTREE_FLAGS="${flagOMP}"
+echo -e "\nBUILDING OPENMP VERSION OF IQ-TREE FOR BRANCH ${branch}\n"
+make -C ${buildDirOMP} -j4
+
+#cp ${buildDir}/iqtree- ${binDir}/${binaryName}
 
 #Clean up
-rm -rf $cur_build
-rm -rf $release_build 
+#rm -rf $buildDir
+
+#echo -e "Binaries of IQ-TREE for branch ${branch} are stored in $binDir"
+#rm -rf $release_build
diff --git a/test_scripts/gen_test_standard.py b/test_scripts/gen_test_standard.py
index dfe7f1e..a14bc02 100755
--- a/test_scripts/gen_test_standard.py
+++ b/test_scripts/gen_test_standard.py
@@ -2,9 +2,9 @@
 '''
 Created on Jan. 26, 2015
 
- at author: tung
+ at author: Tung Nguyen <nltung at gmail.com>
 '''
-import sys, os, time, multiprocessing, optparse 
+import sys, os, time, multiprocessing, optparse
 import subprocess, logging, datetime
 
 def parse_config(config_file):
@@ -22,15 +22,15 @@ def parse_config(config_file):
       readSingleAln = True
       continue
     if line == 'END_SINGLE_ALN':
-      readSingleAln = False 
+      readSingleAln = False
       continue
     if readSingleAln:
-      singleAln.append(line) 
+      singleAln.append(line)
     if line == 'START_PARTITION_ALN':
       readPartAln = True
       continue
     if line == 'END_PARTITION_ALN':
-      readPartAln = False 
+      readPartAln = False
       continue
     if readPartAln:
       partitionAln.append(line.split())
@@ -51,13 +51,15 @@ def parse_config(config_file):
     if genericOpt:
       genericOpts.append(line)
   return (singleAln, partitionAln, genericOpts, partOpts)
-      
+
 
 if __name__ == '__main__':
   usage = "USAGE: %prog [options]"
   parser = optparse.OptionParser(usage=usage)
   parser.add_option('-b','--binary', dest="iqtree_bin", help='Path to your IQ-TREE binary')
   parser.add_option('-c','--config', dest="config_file", help='Path to test configuration file')
+  parser.add_option('-o', '--output', dest="outFile", help='Output file for test cases')
+  parser.add_option('-f', '--flags', dest="flags", help='Additional flags for IQ-TREE')
   (options, args) = parser.parse_args()
   if not options.iqtree_bin or not options.config_file:
     parser.print_help()
@@ -67,26 +69,28 @@ if __name__ == '__main__':
   # Generate test commands for single model
   for aln in singleAln:
     for opt in genericOpts:
-      cmd = '-s ' + aln + ' ' + opt
-      testCmds.append(cmd)
+      cmd = '-s ' + aln + ' -redo ' + opt
+      if options.flags:
+        cmd = cmd + ' ' + options.flags
+        testCmds.append(cmd)
   # Generate test commands for partition model
   for aln in partitionAln:
     for opt in genericOpts:
       for partOpt in partOpts:
-        cmd = '-s ' + aln[0] + ' ' + opt + ' ' + partOpt + ' ' + aln[1]
+        cmd = '-s ' + aln[0] + ' -redo ' + opt + ' ' + partOpt + ' ' + aln[1]
+        if options.flags:
+            cmd = cmd + ' ' + options.flags
         testCmds.append(cmd)
+
   testNr = 1
   jobs = []
   for cmd in testCmds:
     testIDRel = os.path.basename(options.iqtree_bin) + "_TEST_" + str(testNr)
-    testCMD = testIDRel + " " + options.iqtree_bin + " -pre " + testIDRel + " " + cmd
-    testNr = testNr + 1 
+    testCMD = testIDRel + " " + os.path.abspath(options.iqtree_bin) + " -pre " + testIDRel + " " + cmd
+    testNr = testNr + 1
     jobs.append(testCMD)
 #  print "\n".join(jobs)
-  outfile = open(os.path.basename(options.iqtree_bin) + '_test_standard_cmds.txt', "wb")
+  outfile = open(options.outFile, "wb")
   for job in jobs:
     print >> outfile, job
   outfile.close()
-
-
-
diff --git a/test_scripts/generate_test_cmds.py b/test_scripts/generate_test_cmds.py
deleted file mode 100755
index 637eca9..0000000
--- a/test_scripts/generate_test_cmds.py
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/usr/bin/env python
-'''
-Created on Jan. 26, 2015
-
- at author: tung
-'''
-import sys, os, time, multiprocessing, optparse 
-import subprocess, logging, datetime
-
-def parse_config(config_file):
-  singleAln, partitionAln, partOpts, genericOpts = [], [], [], []
-  with open(config_file) as f:
-    #lines = f.readlines()
-    lines = [line.strip() for line in f if line.strip()]
-  readSingleAln = False
-  readPartAln = False
-  partOpt = False
-  genericOpt = False
-  for line in lines:
-    #print line
-    if line == 'START_SINGLE_ALN':
-      readSingleAln = True
-      continue
-    if line == 'END_SINGLE_ALN':
-      readSingleAln = False 
-      continue
-    if readSingleAln:
-      singleAln.append(line) 
-    if line == 'START_PARTITION_ALN':
-      readPartAln = True
-      continue
-    if line == 'END_PARTITION_ALN':
-      readPartAln = False 
-      continue
-    if readPartAln:
-      partitionAln.append(line.split())
-    if line == 'START_PARTITION_OPTIONS':
-      partOpt = True
-      continue
-    if line == 'END_PARTITION_OPTIONS':
-      partOpt = False
-      continue
-    if line == 'START_GENERIC_OPTIONS':
-      genericOpt = True
-      continue
-    if line == 'END_GENERIC_OPTIONS':
-      genericOpt = False
-      continue
-    if partOpt:
-      partOpts.append(line)
-    if genericOpt:
-      genericOpts.append(line)
-  return (singleAln, partitionAln, genericOpts, partOpts)
-      
-
-if __name__ == '__main__':
-  usage = "USAGE: %prog [options]"
-  parser = optparse.OptionParser(usage=usage)
-  parser.add_option('-r','--release', dest="release_bin", help='Path to release binary', default="iqtree_release")
-  parser.add_option('-t','--test', dest="test_bin", help='Path to test binary', default="iqtree_test")
-  parser.add_option('-c','--config', dest="config_file", help='Path to test configuration file')
-  parser.add_option('-o','--out_file', dest="out_file", help='Name of the output file', default="iqtree_test_cmds.txt")
-  (options, args) = parser.parse_args()
-  if len(sys.argv) == 1:
-    parser.print_help()
-    exit(0)
-  (singleAln, partitionAln, genericOpts, partOpts) = parse_config(options.config_file)
-  testCmds = []
-  # Generate test commands for single model
-  for aln in singleAln:
-    for opt in genericOpts:
-      cmd = '-s ' + aln + ' ' + opt
-      testCmds.append(cmd)
-  # Generate test commands for partition model
-  for aln in partitionAln:
-    for opt in genericOpts:
-      for partOpt in partOpts:
-        cmd = '-s ' + aln[0] + ' ' + opt + ' ' + partOpt + ' ' + aln[1]
-        testCmds.append(cmd)
-  testNr = 1
-  jobs = []
-  for cmd in testCmds:
-    testIDRel = options.release_bin + "_TEST_" + str(testNr)
-    release = testIDRel + " " + options.release_bin + " -pre " + testIDRel + " " + cmd
-    testIDTest = options.test_bin + "_TEST_" + str(testNr)
-    test = testIDTest + " " + options.test_bin + " -pre " + testIDTest + " " + cmd
-    testNr = testNr + 1 
-    jobs.append(release)
-    jobs.append(test)
-#  print "\n".join(jobs)
-  outfile = open(options.out_file, "wb")
-  for job in jobs:
-    print >> outfile, job
-  outfile.close()
-
-
-
diff --git a/test_scripts/run_tests.sh b/test_scripts/run_tests.sh
new file mode 100755
index 0000000..5db3c3c
--- /dev/null
+++ b/test_scripts/run_tests.sh
@@ -0,0 +1,50 @@
+#!/bin/bash -
+#===============================================================================
+#
+#          FILE: run_tests.sh
+#
+#         USAGE: ./run_tests.sh
+#
+#   DESCRIPTION:
+#
+#       OPTIONS: ---
+#  REQUIREMENTS: ---
+#          BUGS: ---
+#         NOTES: ---
+#        AUTHOR: Tung Nguyen (nltung at gmail.com),
+#  ORGANIZATION:
+#       CREATED: 2016-08-12 16:43:54 CEST
+#      REVISION:  ---
+#===============================================================================
+
+set -o nounset                              # Treat unset variables as an error
+
+if [ "$#" -lt 1 ]
+then
+    echo "Please enter the name of the local branch you want to compile"
+    echo "USAGE: $0 <branch_name> [<iqtree_flags_in_quotes>]" >&2
+    exit 1
+fi
+
+branchName=$1
+flags=${2:-}    # optional 2nd argument; a bare $2 aborts under 'set -o nounset' when it is omitted
+
+if [ "$flags" == "" ]; then
+    flags="static"    # default flags passed to compile.sh when none were given
+fi
+
+#Compile the specified branch
+source compile.sh ${branchName} "$flags"
+
+#Generate test cases
+echo -e "\nGENERATE TEST CASES FOR THE SEQUENTIAL VERSION\n"
+testCasesSeq="${branchName}-seq-test-cases.txt"
+./gen_test_standard.py -b ${buildDir}/iqtree -c test_configs.txt -o "${testCasesSeq}"
+echo "Test cases were writen to ${testCasesSeq}"
+./submit_jobs.sh 24 ${testCasesSeq} test_data test-results-${branchName}-seq
+
+echo -e "\nGENERATE TEST CASES FOR THE OMP VERSION\n"
+testCasesOMP="${branchName}-omp-test-cases.txt"
+./gen_test_standard.py -b ${buildDirOMP}/iqtree-omp -c test_configs.txt -o "${testCasesOMP}" -f "-nt 2"
+echo "Test cases were writen to ${testCasesOMP}"
+./submit_jobs.sh 48 ${testCasesOMP} test_data test-results-${branchName}-omp
diff --git a/test_scripts/submit_jobs.sh b/test_scripts/submit_jobs.sh
index 7b6f60c..62b1e7a 100755
--- a/test_scripts/submit_jobs.sh
+++ b/test_scripts/submit_jobs.sh
@@ -19,16 +19,15 @@
 
 set -o nounset                              # Treat unset variables as an error
 
-if [ $# -ne 5 ]
+if [ $# -ne 4 ]
 then
-  echo "USAGE: $0 <number_of_threads> <cmd_file> <aln_dir> <out_dir> <binary_dir>"
+  echo "USAGE: $0 <number_of_threads> <cmd_file> <aln_dir> <out_dir>" 
   exit 1
 fi
 numThreads=$1
 cmd_file=$2
 aln_dir=$3
 out_dir=$4
-binary_dir=$5
 
 if [ -d $out_dir ]
 then
@@ -37,9 +36,8 @@ fi
 mkdir $out_dir
 cp ${aln_dir}/* $out_dir
 cp $cmd_file $out_dir
-cp ${binary_dir}/* ${out_dir}/
 cd $out_dir
-submitCMD="submit2sge -N iqtree_system_test -q cluster -r zuseX -s $numThreads \"../jobmanager.py -f $cmd_file -c $numThreads\""
+submitCMD="submit2sge -N iqtree_system_test -s $numThreads \"../jobmanager.py -f $cmd_file -c $numThreads\""
 #echo "../jobmanager.py -f $cmd_file -c $numThreads" | qsub -V -S /bin/bash -cwd -j y -r y -N iqtree_system_test -l zuseX -l cluster -pe threads 16 -q q.norm at zuse02  
 $submitCMD
 cd ..
diff --git a/test_scripts/submitjob.sh b/test_scripts/submitjob.sh
deleted file mode 100755
index 660a4eb..0000000
--- a/test_scripts/submitjob.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-cd test_data
-echo "../jobmanager.py -f ../iqtree_test_cmds.txt -c 16" | qsub -V -S /bin/bash -cwd -j y -r y -N iqtree_system_test -l zuseX -l cluster -pe threads 16 -q q.norm at zuse02  
diff --git a/test_scripts/test_configs.txt b/test_scripts/test_configs.txt
index fbaa35e..b2bca5a 100644
--- a/test_scripts/test_configs.txt
+++ b/test_scripts/test_configs.txt
@@ -1,9 +1,12 @@
 START_PARTITION_ALN
 example.phy example.nex
 d59_8.phy d59_8.nex
+d69_31.phy d69_31.nex
 END_PARTITION_ALN
 
 START_SINGLE_ALN
+d59_8.phy
+d69_31.phy
 example.phy
 prot_M126_27_269.phy
 END_SINGLE_ALN
@@ -15,13 +18,10 @@ START_PARTITION_OPTIONS
 END_PARTITION_OPTIONS
 
 START_GENERIC_OPTIONS
--m TEST -n 1000 
--m TEST -bb 1000 -n 1000
--m TEST -alrt 1000 -n 1000
--m TEST -lbp 1000 -n 1000
--m TEST -bb 1000 -alrt 1000 -lbp 1000 -n 1000
--m TEST -b 10 -n 1000
+-m TEST
+-m TESTNEW
+-m TEST -bb 10000 -alrt 1000 -lbp 1000
+-m TESTNEW -bb 10000 -alrt 1000 -lbp 1000
+-m TEST -b 100
+-m TESTNEW -b 100
 END_GENERIC_OPTIONS
-
-
-
diff --git a/test_scripts/test_data/d59_8.nex b/test_scripts/test_data/d59_8.nex
index 80342c0..b0c9a2b 100644
--- a/test_scripts/test_data/d59_8.nex
+++ b/test_scripts/test_data/d59_8.nex
@@ -8,6 +8,4 @@ charset phyb3rd = 4572-5753;
 charset set5_8S = 5754-5913;
 charset its2 = 5914-6177;
 charset gbss13rd = 6178-6951;
-
-
 end;
diff --git a/tools.cpp b/tools.cpp
index 4613cc1..4e5af85 100644
--- a/tools.cpp
+++ b/tools.cpp
@@ -1,6 +1,8 @@
 /***************************************************************************
- *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
- *   minh.bui at univie.ac.at   *
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -20,14 +22,15 @@
 
 
 
-#if (defined(__GNUC__) || defined(__clang__)) && !defined(WIN32) && !defined(__CYGWIN__)
+#include "tools.h"
+#include "timeutil.h"
+#include "MPIHelper.h"
+
+#if defined(Backtrace_FOUND)
 #include <execinfo.h>
 #include <cxxabi.h>
 #endif
 
-#include "tools.h"
-#include "timeutil.h"
-
 VerboseMode verbose_mode;
 
 /*
@@ -640,6 +643,7 @@ void parseArg(int argc, char *argv[], Params &params) {
     verbose_mode = VB_MIN;
     params.tree_gen = NONE;
     params.user_file = NULL;
+    params.constraint_tree_file = NULL;
     params.opt_gammai = true;
     params.opt_gammai_fast = false;
     params.opt_gammai_keep_bran = false;
@@ -750,7 +754,7 @@ void parseArg(int argc, char *argv[], Params &params) {
     params.manuel_analytic_approx = false;
     params.leastSquareNNI = false;
     params.ls_var_type = OLS;
-    params.maxCandidates = 1000;
+    params.maxCandidates = 20;
     params.popSize = 5;
     params.p_delete = -1;
     params.min_iterations = -1;
@@ -777,6 +781,7 @@ void parseArg(int argc, char *argv[], Params &params) {
     params.num_rate_cats = 4;
     params.max_rate_cats = 10;
     params.gamma_shape = -1.0;
+    params.min_gamma_shape = MIN_GAMMA_SHAPE;
     params.gamma_median = false;
     params.p_invar_sites = -1.0;
     params.optimize_model_rate_joint = false;
@@ -790,6 +795,7 @@ void parseArg(int argc, char *argv[], Params &params) {
     params.iqp = false;
     params.write_intermediate_trees = 0;
 //    params.avoid_duplicated_trees = false;
+    params.writeDistImdTrees = false;
     params.rf_dist_mode = 0;
     params.mvh_site_rate = false;
     params.rate_mh_type = true;
@@ -801,12 +807,17 @@ void parseArg(int argc, char *argv[], Params &params) {
     params.aBayes_test = false;
     params.localbp_replicates = 0;
     params.SSE = LK_EIGEN_SSE;
-    params.lk_no_avx = false;
+    params.lk_no_avx = 0;
+    params.lk_safe_scaling = false;
+    params.numseq_safe_scaling = 2000;
     params.print_site_lh = WSL_NONE;
+    params.print_partition_lh = false;
     params.print_site_prob = WSL_NONE;
     params.print_site_state_freq = WSF_NONE;
     params.print_site_rate = false;
     params.print_trees_site_posterior = 0;
+    params.print_ancestral_sequence = AST_NONE;
+    params.min_ancestral_prob = 0.95;
     params.print_tree_lh = false;
     params.lambda = 1;
     params.speed_conf = 1.0;
@@ -854,6 +865,7 @@ void parseArg(int argc, char *argv[], Params &params) {
     params.step_iterations = 100;
 //    params.store_candidate_trees = false;
 	params.print_ufboot_trees = 0;
+    params.contree_rfdist = -1;
     //const double INF_NNI_CUTOFF = -1000000.0;
     params.nni_cutoff = -1000000.0;
     params.estimate_nni_cutoff = false;
@@ -871,7 +883,7 @@ void parseArg(int argc, char *argv[], Params &params) {
 #else
     params.pll = false;
 #endif
-    params.modeps = 0.01;
+    params.modelEps = 0.01;
     params.parbran = false;
     params.binary_aln_file = NULL;
     params.maxtime = 1000000;
@@ -881,9 +893,13 @@ void parseArg(int argc, char *argv[], Params &params) {
 //    params.autostop = true; // turn on auto stopping rule by default now
     params.unsuccess_iteration = 100;
     params.speednni = true; // turn on reduced hill-climbing NNI by default now
-    params.reduction = false;
     params.numInitTrees = 100;
-    params.fix_stable_splits = false;
+    params.fixStableSplits = false;
+    params.stableSplitThreshold = 0.9;
+    params.five_plus_five = false;
+    params.memCheck = false;
+    params.tabu = false;
+    params.adaptPertubation = false;
     params.numSupportTrees = 20;
 //    params.sprDist = 20;
     params.sprDist = 6;
@@ -894,7 +910,7 @@ void parseArg(int argc, char *argv[], Params &params) {
     params.site_freq_file = NULL;
     params.tree_freq_file = NULL;
 #ifdef _OPENMP
-    params.num_threads = 0;
+    params.num_threads = -1;
 #else
     params.num_threads = 1;
 #endif
@@ -913,7 +929,7 @@ void parseArg(int argc, char *argv[], Params &params) {
 	params.print_splits_file = false;
     params.ignore_identical_seqs = true;
     params.write_init_tree = false;
-    params.write_local_optimal_trees = false;
+    params.write_candidate_trees = false;
     params.freq_const_patterns = NULL;
     params.no_rescale_gamma_invar = false;
     params.compute_seq_identity_along_tree = false;
@@ -1055,8 +1071,7 @@ void parseArg(int argc, char *argv[], Params &params) {
 				params.find_all = true;
 				continue;
 			}
-			if (strcmp(argv[cnt], "-g") == 0
-					|| strcmp(argv[cnt], "--greedy") == 0) {
+			if (strcmp(argv[cnt], "--greedy") == 0) {
 				params.run_mode = GREEDY;
 				continue;
 			}
@@ -1867,7 +1882,9 @@ void parseArg(int argc, char *argv[], Params &params) {
 			}
 			if (strcmp(argv[cnt], "-lmd") == 0) {
 				cnt++;
-				params.lambda = convert_double(argv[cnt]);
+				if (cnt >= argc)
+					throw "Use -lmd <lambda>";
+                params.lambda = convert_double(argv[cnt]);
 				if (params.lambda > 1.0)
 					throw "Lambda must be in (0,1]";
 				continue;
@@ -1890,9 +1907,30 @@ void parseArg(int argc, char *argv[], Params &params) {
 				continue;
 			}
 			if (strcmp(argv[cnt], "-noavx") == 0) {
-				params.lk_no_avx = true;
+				params.lk_no_avx = 1;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nofma") == 0) {
+				params.lk_no_avx = 2;
 				continue;
 			}
+
+			if (strcmp(argv[cnt], "-safe") == 0) {
+				params.lk_safe_scaling = true;
+				continue;
+			}
+
+			if (strcmp(argv[cnt], "-safe-seq") == 0) {
+				cnt++;
+				if (cnt >= argc)
+                    throw "-safe-seq <number of sequences>";
+				params.numseq_safe_scaling = convert_int(argv[cnt]);
+                if (params.numseq_safe_scaling < 10)
+                    throw "Too small -safe-seq";
+				continue;
+			}
+
+
 			if (strcmp(argv[cnt], "-f") == 0) {
 				cnt++;
 				if (cnt >= argc)
@@ -1929,7 +1967,8 @@ void parseArg(int argc, char *argv[], Params &params) {
 				if (cnt >= argc)
 					throw "Use -ft <treefile_to_infer_site_frequency_model>";
                 params.tree_freq_file = argv[cnt];
-                params.print_site_state_freq = WSF_POSTERIOR_MEAN;
+                if (params.print_site_state_freq == WSF_NONE)
+                    params.print_site_state_freq = WSF_POSTERIOR_MEAN;
                 continue;
             }
 
@@ -1972,10 +2011,21 @@ void parseArg(int argc, char *argv[], Params &params) {
 				if (cnt >= argc)
 					throw "Use -a <gamma_shape>";
 				params.gamma_shape = convert_double(argv[cnt]);
-//				if (params.gamma_shape < 0)
-//					throw "Wrong number of gamma shape parameter (alpha)";
+				if (params.gamma_shape <= 0)
+					throw "Wrong gamma shape parameter (alpha)";
 				continue;
 			}
+
+			if (strcmp(argv[cnt], "-amin") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -amin <min_gamma_shape>";
+				params.min_gamma_shape = convert_double(argv[cnt]);
+				if (params.min_gamma_shape <= 0)
+					throw "Wrong minimum gamma shape parameter (alpha)";
+				continue;
+			}
+
 			if (strcmp(argv[cnt], "-gmean") == 0) {
 				params.gamma_median = false;
 				continue;
@@ -2133,15 +2183,27 @@ void parseArg(int argc, char *argv[], Params &params) {
 				params.iqp = true;
 				continue;
 			}
-			if (strcmp(argv[cnt], "-wlt") == 0) {
-				// write all candidate trees
-				params.write_local_optimal_trees = true;
+			if (strcmp(argv[cnt], "-wct") == 0) {
+				params.write_candidate_trees = true;
 				continue;
 			}
+
 			if (strcmp(argv[cnt], "-wt") == 0) {
 				params.write_intermediate_trees = 1;
 				continue;
 			}
+
+            if (strcmp(argv[cnt], "-wdt") == 0) {
+                params.writeDistImdTrees = true;
+                continue;
+            }
+
+            if (strcmp(argv[cnt], "-wtc") == 0) {
+                params.write_intermediate_trees = 1;
+                params.print_tree_lh = true;
+                continue;
+            }
+
 			if (strcmp(argv[cnt], "-wt2") == 0) {
 				params.write_intermediate_trees = 2;
 //				params.avoid_duplicated_trees = true;
@@ -2206,6 +2268,8 @@ void parseArg(int argc, char *argv[], Params &params) {
 			}
 			if (strcmp(argv[cnt], "-alrt") == 0) {
 				cnt++;
+				if (cnt >= argc)
+					throw "Use -alrt <#replicates | 0>";
                 int reps = convert_int(argv[cnt]);
                 if (reps == 0)
                     params.aLRT_test = true;
@@ -2222,6 +2286,8 @@ void parseArg(int argc, char *argv[], Params &params) {
 			}
 			if (strcmp(argv[cnt], "-lbp") == 0) {
 				cnt++;
+				if (cnt >= argc)
+					throw "Use -lbp <#replicates>";
 				params.localbp_replicates = convert_int(argv[cnt]);
 				if (params.localbp_replicates < 1000
 						&& params.localbp_replicates != 0)
@@ -2232,6 +2298,12 @@ void parseArg(int argc, char *argv[], Params &params) {
 				params.print_site_lh = WSL_SITE;
 				continue;
 			}
+
+			if (strcmp(argv[cnt], "-wpl") == 0) {
+				params.print_partition_lh = true;
+				continue;
+			}
+
 			if (strcmp(argv[cnt], "-wslg") == 0 || strcmp(argv[cnt], "-wslr") == 0) {
 				params.print_site_lh = WSL_RATECAT;
 				continue;
@@ -2260,6 +2332,28 @@ void parseArg(int argc, char *argv[], Params &params) {
 				continue;
 			}
 
+			if (strcmp(argv[cnt], "-asr") == 0) {
+				params.print_ancestral_sequence = AST_MARGINAL;
+                params.ignore_identical_seqs = false;
+				continue;
+			}
+
+			if (strcmp(argv[cnt], "-asr-min") == 0) {
+                cnt++;
+				if (cnt >= argc)
+					throw "Use -asr-min <probability>";
+                
+                params.min_ancestral_prob = convert_double(argv[cnt]);
+                if (params.min_ancestral_prob < 0.5 || params.min_ancestral_prob > 1)
+                    throw "Minimum ancestral probability [-asr-min] must be between 0.5 and 1.0";
+                continue;
+            }
+
+			if (strcmp(argv[cnt], "-asr-joint") == 0) {
+				params.print_ancestral_sequence = AST_JOINT;
+                params.ignore_identical_seqs = false;
+				continue;
+			}
 
 			if (strcmp(argv[cnt], "-wsr") == 0) {
 				params.print_site_rate = true;
@@ -2273,7 +2367,7 @@ void parseArg(int argc, char *argv[], Params &params) {
 				params.print_site_state_freq = WSF_POSTERIOR_MEAN;
 				continue;
 			}
-			if (strcmp(argv[cnt], "-wsfm") == 0) {
+			if (strcmp(argv[cnt], "-wsfm") == 0 || strcmp(argv[cnt], "-fmax") == 0) {
 				params.print_site_state_freq = WSF_POSTERIOR_MAX;
 				continue;
 			}
@@ -2524,12 +2618,28 @@ void parseArg(int argc, char *argv[], Params &params) {
 //				params.store_candidate_trees = false;
 //				continue;
 //			}
-			if (strcmp(argv[cnt], "-lhmemsave") == 0) {
-				params.lh_mem_save = LM_PER_NODE;
-				continue;
-			}
-			if (strcmp(argv[cnt], "-nolhmemsave") == 0) {
-				params.lh_mem_save = LM_ALL_BRANCH;
+			if (strcmp(argv[cnt], "-mem") == 0) {
+				cnt++;
+				if (cnt >= argc)
+                    throw "Use -mem max_mem_size";
+				params.lh_mem_save = LM_MEM_SAVE;
+                int end_pos;
+                double mem = convert_double(argv[cnt], end_pos);
+                if (mem < 0)
+                    throw "-mem must be non-negative";
+                if (argv[cnt][end_pos] == 'G') {
+                    params.max_mem_size = mem * 1073741824.0;
+                } else if (argv[cnt][end_pos] == 'M') {
+                    params.max_mem_size = mem * 1048576.0;
+                } else if (argv[cnt][end_pos] == '%'){
+                    params.max_mem_size = mem * 0.01;
+                    if (params.max_mem_size > 1)
+                        throw "-mem percentage must be between 0 and 100";
+                } else {
+                    if (mem > 1)
+                        throw "Invalid -mem option. Example: -mem 200M, -mem 10G";
+                    params.max_mem_size = mem;
+                }
 				continue;
 			}
 //			if (strcmp(argv[cnt], "-storetrees") == 0) {
@@ -2586,30 +2696,66 @@ void parseArg(int argc, char *argv[], Params &params) {
 				params.stop_condition = SC_REAL_TIME;
 				continue;
 			}
-			if (strcmp(argv[cnt], "-numpars") == 0) {
+			if (strcmp(argv[cnt], "-numpars") == 0 || strcmp(argv[cnt], "-ninit") == 0) {
 				cnt++;
 				if (cnt >= argc)
-					throw "Use -numpars <number_of_parsimony_trees>";
+					throw "Use -ninit <number_of_parsimony_trees>";
 				params.numInitTrees = convert_int(argv[cnt]);
+                if (params.numInitTrees < 0)
+                    throw "-ninit must be non-negative";
 				if (params.numInitTrees < params.numNNITrees)
 					params.numNNITrees = params.numInitTrees;
 				continue;
 			}
 			if (strcmp(argv[cnt], "-fss") == 0) {
-				params.fix_stable_splits = true;
+				params.fixStableSplits = true;
+//				params.five_plus_five = true;
+				continue;
+			}
+            if (strcmp(argv[cnt], "--stable-thres") == 0) {
+                cnt++;
+                if (cnt >= argc)
+                    throw "Use --stable-thres <support_value_threshold>";
+                params.stableSplitThreshold = convert_double(argv[cnt]);
+                continue;
+            }
+			if (strcmp(argv[cnt], "-ff") == 0) {
+				params.five_plus_five = true;
 				continue;
 			}
-			if (strcmp(argv[cnt], "-toppars") == 0) {
+
+			if (strcmp(argv[cnt], "-tabu") == 0) {
+                params.fixStableSplits = true;
+				params.tabu = true;
+                params.maxCandidates = params.numSupportTrees;
+				continue;
+			}
+
+            if (strcmp(argv[cnt], "--adt-pert") == 0) {
+                if (params.tabu == true) {
+                    outError("option -tabu and --adt-pert cannot be combined");
+                }
+                params.adaptPertubation = true;
+                params.stableSplitThreshold = 1.0;
+                continue;
+            }
+
+            if (strcmp(argv[cnt], "-memcheck") == 0) {
+                params.memCheck = true;
+                continue;
+            }
+
+			if (strcmp(argv[cnt], "-toppars") == 0 || strcmp(argv[cnt], "-ntop") == 0) {
 				cnt++;
 				if (cnt >= argc)
-					throw "Use -toppars <number_of_top_parsimony_trees>";
+					throw "Use -ntop <number_of_top_parsimony_trees>";
 				params.numNNITrees = convert_int(argv[cnt]);
 				continue;
 			}
-			if (strcmp(argv[cnt], "-nsp") == 0) {
+			if (strcmp(argv[cnt], "--num-sup-trees") == 0) {
 				cnt++;
 				if (cnt >= argc)
-					throw "Use -nsp <number_of_support_trees>";
+					throw "Use --num-sup-trees <number_of_support_trees>";
 				params.numSupportTrees = convert_int(argv[cnt]);
 				continue;
 			}
@@ -2664,10 +2810,10 @@ void parseArg(int argc, char *argv[], Params &params) {
 				continue;
 			}
 			if (strcmp(argv[cnt], "-popsize") == 0
-					|| strcmp(argv[cnt], "-numcand") == 0) {
+					|| strcmp(argv[cnt], "-numcand") == 0 || strcmp(argv[cnt], "-nbest") == 0) {
 				cnt++;
 				if (cnt >= argc)
-					throw "Use -numcand <number_of_candidate_trees>";
+					throw "Use -nbest <number_of_candidate_trees>";
 				params.popSize = convert_int(argv[cnt]);
 				assert(params.popSize < params.numInitTrees);
 				continue;
@@ -2689,10 +2835,10 @@ void parseArg(int argc, char *argv[], Params &params) {
 				cnt++;
 				if (cnt >= argc)
 					throw "Use -me <model_epsilon>";
-				params.modeps = convert_double(argv[cnt]);
-				if (params.modeps <= 0.0)
+				params.modelEps = convert_double(argv[cnt]);
+				if (params.modelEps <= 0.0)
 					throw "Model epsilon must be positive";
-				if (params.modeps > 0.1)
+				if (params.modelEps > 0.1)
 					throw "Model epsilon must not be larger than 0.1";
 				continue;
 			}
@@ -2704,10 +2850,7 @@ void parseArg(int argc, char *argv[], Params &params) {
 				params.speednni = false;
 				continue;
 			}
-			if (strcmp(argv[cnt], "-reduction") == 0) {
-				params.reduction = true;
-				continue;
-			}
+            
 			if (strcmp(argv[cnt], "-snni") == 0) {
 				params.snni = true;
 				// dont need to turn this on here
@@ -2720,17 +2863,21 @@ void parseArg(int argc, char *argv[], Params &params) {
 			if (strcmp(argv[cnt], "-iqpnni") == 0) {
 				params.snni = false;
 				params.start_tree = STT_BIONJ;
-				params.reduction = false;
 				params.numNNITrees = 1;
 //            continue; } if (strcmp(argv[cnt], "-auto") == 0) {
 //            	params.autostop = true;
 				continue;
 			}
-			if (strcmp(argv[cnt], "-stop_cond") == 0 || strcmp(argv[cnt], "-numstop") == 0) {
+			if (strcmp(argv[cnt], "-stop_cond") == 0 || strcmp(argv[cnt], "-numstop") == 0
+                 || strcmp(argv[cnt], "-nstop") == 0) {
 				if (params.stop_condition != SC_BOOTSTRAP_CORRELATION)
 					params.stop_condition = SC_UNSUCCESS_ITERATION;
 				cnt++;
+				if (cnt >= argc)
+					throw "Use -nstop <#iterations>";
 				params.unsuccess_iteration = convert_int(argv[cnt]);
+                if (params.unsuccess_iteration <= 0)
+                    throw "-nstop iterations must be positive";
 				continue;
 			}
 			if (strcmp(argv[cnt], "-lsbran") == 0) {
@@ -2881,10 +3028,14 @@ void parseArg(int argc, char *argv[], Params &params) {
 			if (strcmp(argv[cnt], "-omp") == 0 || strcmp(argv[cnt], "-nt") == 0) {
 				cnt++;
 				if (cnt >= argc)
-				throw "Use -nt <num_threads>";
-				params.num_threads = convert_int(argv[cnt]);
-				if (params.num_threads < 1)
-					throw "At least 1 thread please";
+				throw "Use -nt <num_threads|AUTO>";
+                if (strcmp(argv[cnt], "AUTO") == 0)
+                    params.num_threads = 0;
+                else {
+                    params.num_threads = convert_int(argv[cnt]);
+                    if (params.num_threads < 1)
+                        throw "At least 1 thread please";
+                }
 				continue;
 			}
 //			if (strcmp(argv[cnt], "-rootstate") == 0) {
@@ -2939,6 +3090,14 @@ void parseArg(int argc, char *argv[], Params &params) {
 				continue;
 			}
             
+            if (strcmp(argv[cnt], "-g") == 0) {
+                cnt++;
+                if (cnt >= argc)
+                    throw "Use -g <constraint_tree>";
+                params.constraint_tree_file = argv[cnt];
+                continue;
+            }
+            
 			if (strcmp(argv[cnt], "-lmap") == 0) {
 				cnt++;
 				if (cnt >= argc)
@@ -3012,6 +3171,10 @@ void parseArg(int argc, char *argv[], Params &params) {
 				continue;
 			}
 
+			if (strcmp(argv[cnt], "--no-uniqueseq") == 0) {
+				params.suppress_output_flags |= OUT_UNIQUESEQ;
+				continue;
+			}
 
 			if (argv[cnt][0] == '-') {
                 string err = "Invalid \"";
@@ -3051,6 +3214,13 @@ void parseArg(int argc, char *argv[], Params &params) {
         usage(argv, false);
 #endif
     }
+    
+    if (params.do_au_test && params.topotest_replicates == 0)
+        outError("For AU test please please specify number of bootstrap replicates via -zb option");
+
+    if (params.lh_mem_save == LM_MEM_SAVE && params.partition_file)
+        outError("-mem option does not work with partition models yet");
+
     if (!params.out_prefix) {
     	if (params.eco_dag_file)
     		params.out_prefix = params.eco_dag_file;
@@ -3065,6 +3235,12 @@ void parseArg(int argc, char *argv[], Params &params) {
         else
             params.out_prefix = params.user_file;
     }
+//    if (MPIHelper::getInstance().isWorker()) {
+    // BUG: setting out_prefix this way leaves it pointing into the local string newPrefix, which is destroyed when this function returns
+//        string newPrefix = string(params.out_prefix) + "."  + NumberToString(MPIHelper::getInstance().getProcessID()) ;
+//        params.out_prefix = (char *) newPrefix.c_str();
+//    }
+
 }
 
 extern void printCopyright(ostream &out);
@@ -3161,6 +3337,8 @@ void usage_iqtree(char* argv[], bool full_command) {
             << "  -v, -vv, -vvv        Verbose mode, printing more messages to screen" << endl
             << "  -quiet               Silent mode, suppress printing to screen (stdout)" << endl
             << "  -keep-ident          Keep identical sequences (default: remove & finally add)" << endl
+            << "  -safe                Safe likelihood kernel to avoid numerical underflow" << endl
+            << "  -mem RAM             Maximal RAM usage for memory saving mode" << endl
             << endl << "CHECKPOINTING TO RESUME STOPPED RUN:" << endl
             << "  -redo                Redo analysis even for successful runs (default: resume)" << endl
             << "  -cptime <seconds>    Minimum checkpoint time interval (default: 20)" << endl
@@ -3170,16 +3348,17 @@ void usage_iqtree(char* argv[], bool full_command) {
             << "  -wql                 Print quartet log-likelihoods to .quartetlh file" << endl
             << endl << "NEW STOCHASTIC TREE SEARCH ALGORITHM:" << endl
 //            << "  -pll                 Use phylogenetic likelihood library (PLL) (default: off)" << endl
-            << "  -numpars <number>    Number of initial parsimony trees (default: 100)" << endl
-            << "  -toppars <number>    Number of best parsimony trees (default: 20)" << endl
-            << "  -sprrad <number>     Radius for parsimony SPR search (default: 6)" << endl
-            << "  -numcand <number>    Size of the candidate tree set (defaut: 5)" << endl
+            << "  -ninit <number>      Number of initial parsimony trees (default: 100)" << endl
+            << "  -ntop <number>       Number of top initial trees (default: 20)" << endl
+            << "  -nbest <number>      Number of best trees retained during search (defaut: 5)" << endl
+            << "  -n <#iterations>     Fix number of iterations to <#iterations> (default: auto)" << endl
+            << "  -nstop <number>      Number of unsuccessful iterations to stop (default: 100)" << endl
             << "  -pers <proportion>   Perturbation strength for randomized NNI (default: 0.5)" << endl
+            << "  -sprrad <number>     Radius for parsimony SPR search (default: 6)" << endl
             << "  -allnni              Perform more thorough NNI search (default: off)" << endl
-            << "  -numstop <number>    Number of unsuccessful iterations to stop (default: 100)" << endl
-            << "  -n <#iterations>     Fix number of iterations to <#iterations> (default: auto)" << endl
-            << "  -iqp                 Use the IQP tree perturbation (default: randomized NNI)" << endl
-            << "  -iqpnni              Switch back to the old IQPNNI tree search algorithm" << endl
+            << "  -g <constraint_tree> (Multifurcating) topological constraint tree file" << endl
+//            << "  -iqp                 Use the IQP tree perturbation (default: randomized NNI)" << endl
+//            << "  -iqpnni              Switch back to the old IQPNNI tree search algorithm" << endl
             << endl << "ULTRAFAST BOOTSTRAP:" << endl
             << "  -bb <#replicates>    Ultrafast bootstrap (>=1000)" << endl
             << "  -wbt                 Write bootstrap trees to .ufboot file (default: none)" << endl
@@ -3259,6 +3438,7 @@ void usage_iqtree(char* argv[], bool full_command) {
             << "                       Invar, Gamma, Invar+Gamma, or FreeRate model where 'n' is" << endl
             << "                       number of categories (default: n=4)" << endl
             << "  -a <Gamma_shape>     Gamma shape parameter for site rates (default: estimate)" << endl
+            << "  -amin <min_shape>    Min Gamma shape parameter for site rates (default: 0.02)" << endl
             << "  -gmedian             Median approximation for +G site rates (default: mean)" << endl
             << "  --opt-gamma-inv      More thorough estimation for +I+G model parameters" << endl
             << "  -i <p_invar>         Proportion of invariable sites (default: estimate)" << endl
@@ -3268,7 +3448,8 @@ void usage_iqtree(char* argv[], bool full_command) {
             << endl << "SITE-SPECIFIC FREQUENCY MODEL:" << endl 
             << "  -ft <tree_file>      Input tree to infer site frequency model" << endl
             << "  -fs <in_freq_file>   Input site frequency model file" << endl
-            << "  -wsf                 Write site frequency model to .sitefreq file" << endl
+            << "  -fmax                Posterior maximum instead of posterior mean approximation" << endl
+            //<< "  -wsf                 Write site frequency model to .sitefreq file" << endl
             //<< "  -c <#categories>     Number of Gamma rate categories (default: 4)" << endl
 //            << endl << "TEST OF MODEL HOMOGENEITY:" << endl
 //            << "  -m WHTEST            Testing model (GTR+G) homogeneity assumption using" << endl
@@ -3308,6 +3489,12 @@ void usage_iqtree(char* argv[], bool full_command) {
             << "  -zb <#replicates>    Performing BP,KH,SH,ELW tests for trees passed via -z" << endl
             << "  -zw                  Also performing weighted-KH and weighted-SH tests" << endl
             << "  -au                  Also performing approximately unbiased (AU) test" << endl
+//            << endl << "ANCESTRAL SEQUENCE RECONSTRUCTION:" << endl
+//            << "  -asr                 Compute ancestral states by marginal reconstruction" << endl
+//            << "  -asr-min <prob>      Min probability to assign ancestral sequence (default: 0.95)" << endl
+//            << "  -wja                 Write ancestral sequences by joint reconstruction" << endl
+
+
             << endl;
 
 			cout << "GENERATING RANDOM TREES:" << endl;
@@ -3333,6 +3520,7 @@ void usage_iqtree(char* argv[], bool full_command) {
             << "  -wspr                Write site probabilities per rate category" << endl
             << "  -wspm                Write site probabilities per mixture class" << endl
             << "  -wspmr               Write site probabilities per mixture+rate class" << endl
+			<< "  -wpl                 Write partition log-likelihoods to .partlh file" << endl
             << "  -fconst f1,...,fN    Add constant patterns into alignment (N=#nstates)" << endl
             << "  -me <epsilon>        Logl epsilon for model parameter optimization (default 0.01)" << endl
             << "  --no-outfiles        Suppress printing output files" << endl;
@@ -3372,7 +3560,7 @@ void quickStartGuide() {
 #endif
          << "To show all available options: run 'iqtree -h'" << endl << endl
          << "Have a look at the tutorial and manual for more information:" << endl
-         << "     http://www.cibiv.at/software/iqtree" << endl << endl;
+         << "     http://www.iqtree.org" << endl << endl;
     exit(0);
 }
 
@@ -3659,10 +3847,12 @@ int random_int(int n, int *rstream) {
     return (int) floor(random_double(rstream) * n);
 } /* randominteger */
 
-//int randint(int a, int b) {
-//	return a + (RAND_MAX * rand() + rand()) % (b + 1 - a);
-//}
-//
+/* returns the random integer a + random_int(b - a); asserts b > a. NOTE(review): upper bound looks like b-1, not b -- confirm */
+int random_int(int a, int b) {
+	assert(b > a);
+	//return a + (RAND_MAX * rand() + rand()) % (b + 1 - a);
+	return a + random_int(b - a); // offset in [0; b-a-1] if random_double() stays below 1 (TODO confirm), so b itself is not returned
+}
 
 double random_double(int *rstream) {
 #ifndef FIXEDINTRAND
@@ -3802,6 +3992,8 @@ void trimString(string &str) {
     str.erase(str.find_last_not_of(" \n\r\t")+1);
 }
 
+
+
 Params& Params::getInstance() {
     static Params instance;
     return instance;
@@ -3819,6 +4011,7 @@ int countPhysicalCPUCores() {
 #else
     logicalcpucount = sysconf( _SC_NPROCESSORS_ONLN );
 #endif
+    if (logicalcpucount < 1) logicalcpucount = 1;
     return logicalcpucount;
     
     if (logicalcpucount % 2 != 0)
@@ -3837,6 +4030,7 @@ int countPhysicalCPUCores() {
     } else {
         physicalcpucount = logicalcpucount;
     }
+    if (physicalcpucount < 1) physicalcpucount = 1;
     return physicalcpucount;
 }
 
@@ -3845,7 +4039,7 @@ int countPhysicalCPUCores() {
 
 /** Print a demangled stack backtrace of the caller function to FILE* out. */
 
-#if  defined(WIN32) || defined(__CYGWIN__) 
+#if  !defined(Backtrace_FOUND)
 
 // donothing for WIN32
 void print_stacktrace(ostream &out, unsigned int max_frames) {}
@@ -3976,10 +4170,19 @@ void print_stacktrace(ostream &out, unsigned int max_frames)
 
 }
 
-#endif // WIN32
+#endif // Backtrace_FOUND
 
 bool memcmpcpy(void * destination, const void * source, size_t num) {
     bool diff = (memcmp(destination, source, num) != 0);
     memcpy(destination, source, num);
     return diff;
 }
+
+// Order-independent pairing: (s*(s+1))/2 + max(int1,int2) with s = int1+int2; see https://en.wikipedia.org/wiki/Pairing_function
+int pairInteger(int int1, int int2) {
+    if (int1 <= int2) {
+        return ((int1 + int2)*(int1 + int2 + 1)/2 + int2); // int2 is the larger (or equal) operand
+    } else {
+        return ((int1 + int2)*(int1 + int2 + 1)/2 + int1); // int1 is the larger operand; plain int arithmetic, no overflow check
+    }
+}
diff --git a/tools.h b/tools.h
index 480648d..2e2f380 100644
--- a/tools.h
+++ b/tools.h
@@ -1,6 +1,8 @@
 /***************************************************************************
- *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
- *   minh.bui at univie.ac.at   *
+ *   Copyright (C) 2009-2015 by                                            *
+ *   BUI Quang Minh <minh.bui at univie.ac.at>                                *
+ *   Lam-Tung Nguyen <nltung at gmail.com>                                    *
+ *                                                                         *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -32,6 +34,8 @@
 #include <stdlib.h>
 #include <math.h>
 #include <stdint.h>
+#include <string.h>
+#include <sstream>
 
 //#include <sys/time.h>
 //#include <time.h>
@@ -61,10 +65,10 @@ inline void _my_assert(const char* expression, const char *func, const char* fil
 
 #define USE_HASH_MAP
 
-#ifdef __GNUC__
+#if defined(__GNUC__) && !defined(GCC_VERSION)
 #define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-#else
-#define GCC_VERSION 0
+//#else
+//#define GCC_VERSION 0
 #endif
 
 // for MSVC
@@ -73,13 +77,23 @@ inline void _my_assert(const char* expression, const char *func, const char* fil
 #endif
 
 #if defined(USE_HASH_MAP)
+//    #include <unordered_map>
+//    #include <unordered_set>
+
 	#if defined(_MSC_VER)
 		#include <unordered_map>
 		#include <unordered_set>
     #elif defined(__clang__)
-		#include <tr1/unordered_map>
-		#include <tr1/unordered_set>
-		using namespace std::tr1;    
+        // libc++ detected:     _LIBCPP_VERSION
+        // libstdc++ detected:  __GLIBCXX__
+        #if __has_include(<unordered_map>) // defines _LIBCPP_VERSION
+            #include <unordered_map>
+            #include <unordered_set>
+        #else
+            #include <tr1/unordered_map>
+            #include <tr1/unordered_set>
+            using namespace std::tr1;    
+        #endif
 	#elif !defined(__GNUC__)
 		#include <hash_map>
 		#include <hash_set>
@@ -95,6 +109,7 @@ inline void _my_assert(const char* expression, const char *func, const char* fil
 		#include <tr1/unordered_set>
 		using namespace std::tr1;
 	#endif
+
 #else
 	#include <map>
 	#include <set>
@@ -222,7 +237,7 @@ typedef vector<string> StrVector;
 /**
         matrix of double number
  */
-#define matrix(T) vector<vector<T> >
+#define matrix(T) vector< vector<T> >
 
 /**
         matrix of double
@@ -287,6 +302,7 @@ const int WT_NEWLINE = 128;
 const int WT_BR_LEN_FIXED_WIDTH = 256;
 const int WT_BR_ID = 512;
 const int WT_BR_LEN_ROUNDING = 1024;
+const int WT_BR_LEN_SHORT = 2048; // store only 6 digits after the comma for branch lengths
 const int TRUE = 1;
 const int FALSE = 0;
 
@@ -414,7 +430,7 @@ enum LikelihoodKernel {
 };
 
 enum LhMemSave {
-	LM_DETECT, LM_ALL_BRANCH, LM_PER_NODE
+	LM_PER_NODE, LM_MEM_SAVE
 };
 
 enum SiteLoglType {
@@ -425,6 +441,10 @@ enum SiteFreqType {
     WSF_NONE, WSF_POSTERIOR_MEAN, WSF_POSTERIOR_MAX
 };
 
+enum AncestralSeqType { // values stored in Params::print_ancestral_sequence
+    AST_NONE, AST_MARGINAL, AST_JOINT // parseArg: default / set by "-asr" / set by "-asr-joint"
+};
+
 const int BRLEN_OPTIMIZE = 0; // optimize branch lengths
 const int BRLEN_FIX      = 1; // fix branch lengths
 const int BRLEN_SCALE    = 2; // scale branch lengths
@@ -432,6 +452,15 @@ const int BRLEN_SCALE    = 2; // scale branch lengths
 const int OUT_LOG       = 1; // .log file written or not
 const int OUT_TREEFILE  = 2; // .treefile file written or not
 const int OUT_IQTREE    = 4; // .iqtree file written or not
+const int OUT_UNIQUESEQ = 8; // .uniqueseq file written or not
+
+
+const double MIN_GAMMA_RATE = 1e-6;
+// change from 0.01 to 0.02 as 0.01 causes numerical problems
+const double MIN_GAMMA_SHAPE = 0.02;
+const double MAX_GAMMA_SHAPE = 1000.0;
+const double TOL_GAMMA_SHAPE = 0.001;
+
 
 /** maximum number of newton-raphson steps for NNI branch evaluation */
 extern int NNI_MAX_NR_STEP;
@@ -456,6 +485,32 @@ private:
 public:
 
     /**
+    *  Fast and accurate optimization for alpha and p_invar
+    */
+    bool fai;
+
+    /**
+     *  Option to check memory consumption only
+     */
+    bool memCheck;
+
+    /**
+     *  The support threshold for stable splits (Default = 0.9)
+     */
+    double stableSplitThreshold;
+
+    /**
+     *  Option for adaptive perturbation.
+     *  Branches that are shared among all candidate trees will be perturbed
+     */
+    bool adaptPertubation;
+
+	/**
+	 *  Option to do multiple starts for estimating alpha and p_invar
+	 */
+	bool testAlpha;
+
+    /**
      *  Restart the optimization of alpha and pinvar from different starting
      *  pinv values (supercedes the option testAlpha
      */
@@ -491,17 +546,30 @@ public:
     bool exh_ai;
 
 	/**
-	 *  User file contains the alpha and invar parameters
+	 *  Text file containing all pairs of alpha and p_invar to
+	 *  evaluate.
+	 *  TODO Remove this option and implement the exhaustive search
+	 *  directly into IQ-TREE
 	 */
 	char* alpha_invar_file;
 
 	/**
+	 *  Enable tabu search for NNI
+	 */
+	bool tabu;
+
+    /**
+	 *  Use (5+5)-ES strategy
+	 */
+	bool five_plus_five;
+
+	/**
 	 * Turn on feature to identify stable splits and fix them during tree search
 	 */
-	bool fix_stable_splits;
+	bool fixStableSplits;
 
 	/**
-	 *  Number of distinct locally optimal trees
+	 *  Number of best trees used to compute stable splits
 	 */
 	int numSupportTrees;
 
@@ -523,11 +591,14 @@ public:
 
 	/**
 	 *  Number of best trees in the candidate set used to generate perturbed trees
+	 *  In terms of evolutionary algorithms, this is the population size
 	 */
 	int popSize;
 
 	/**
 	 *  Maximum number of trees stored in the candidate tree set
+	 *  This is just a technical constraint to ensure that the candidate tree set
+	 *  does not have to store a lot of trees
 	 */
 	int maxCandidates;
 
@@ -536,10 +607,6 @@ public:
 	 */
 	bool speednni;
 
-	/**
-	 *  use reduction technique to constraint tree space
-	 */
-	bool reduction;
 
 	/**
 	 *  portion of NNI used for perturbing the tree
@@ -549,7 +616,7 @@ public:
 	/**
 	 *  logl epsilon for model parameter optimization
 	 */
-	double modeps;
+	double modelEps;
 
 	/**
 	 *  New search heuristics (DEFAULT: ON)
@@ -672,6 +739,9 @@ public:
     /* type of starting tree */
     START_TREE_TYPE start_tree;
 
+    /** name of constraint tree file in NEWICK format */
+    char *constraint_tree_file;
+
     /**
             prefix of the output file, default is the same as input file
      */
@@ -1227,6 +1297,11 @@ public:
     double gamma_shape;
 
     /**
+            minimum shape parameter (alpha) of the Gamma distribution for site rates
+     */
+    double min_gamma_shape;
+
+    /**
             TRUE to use median rate for discrete categories, FALSE to use mean rate instead
      */
     bool gamma_median;
@@ -1299,15 +1374,23 @@ public:
     char *bootstrap_spec;
 
     /**
-            1 if output all intermediate trees from every IQPNNI iteration
+            1 if output all intermediate trees (initial trees, NNI-optimal trees and trees after each NNI step)
             2 if output all intermediate trees + 1-NNI-away trees
      */
     int write_intermediate_trees;
 
     /**
-     *  Write out all candidate trees (the locally optimal trees)
+     *  Write all distinct intermediate trees and their likelihoods
+     *  Note: intermediate trees are trees that have been visited by the search. These include trees created by
+     *  NNI-steps within each NNI iteration.
      */
-    int write_local_optimal_trees;
+    bool writeDistImdTrees;
+
+    /**
+     *  Write trees obtained at the end of each NNI search
+     */
+    bool write_candidate_trees;
+
 
     /**
         TRUE to avoid duplicated trees while writing intermediate trees
@@ -1368,7 +1451,13 @@ public:
     LikelihoodKernel SSE;
 
     /** TRUE to not use AVX even available in CPU, default: FALSE */
-    bool lk_no_avx;
+    int lk_no_avx;
+
+    /** TRUE for safe numerical scaling (per category; used for large trees), default: FALSE */
+    bool lk_safe_scaling;
+
+    /** minimum number of sequences to always use safe scaling, default: 2000 */
+    int numseq_safe_scaling;
 
     /**
      	 	WSL_NONE: do not print anything
@@ -1376,9 +1465,13 @@ public:
             WSL_RATECAT: print site log-likelihood per rate category
             WSL_MIXTURE: print site log-likelihood per mixture class
             WSL_MIXTURE_RATECAT: print site log-likelihood per mixture class per rate category
+            WSL_STATE: print site log-likelihood per state
      */
     SiteLoglType print_site_lh;
 
+    /** TRUE to print partition log-likelihood, default: FALSE */
+    bool print_partition_lh;
+
     /**
         control printing posterior probability of each site belonging to a rate/mixture categories
         same meaning as print_site_lh, but results are printed to .siteprob file
@@ -1389,6 +1482,16 @@ public:
     SiteLoglType print_site_prob;
 
     /**
+        AST_NONE: do not print ancestral sequences (default)
+        AST_MARGINAL: print ancestral sequences by marginal reconstruction
+        AST_JOINT: print ancestral sequences by joint reconstruction
+    */
+    AncestralSeqType print_ancestral_sequence;
+
+    /** minimum probability to assign an ancestral state */
+    double min_ancestral_prob;
+
+    /**
         0: print nothing
         1: print site state frequency vectors
     */
@@ -1620,6 +1723,8 @@ public:
 	/** true to print all UFBoot trees to a file */
 	int print_ufboot_trees;
 
+    int contree_rfdist;
+
     /****** variables for NNI cutoff heuristics ******/
 
     /**
@@ -1722,6 +1827,9 @@ public:
 	 * 1: only store 1 partial likelihood vector per node */
 	LhMemSave lh_mem_save;
 
+    /** maximum size of memory allowed to use */
+    double max_mem_size;
+
 	/* TRUE to print .splits file in star-dot format */
 	bool print_splits_file;
     
@@ -1733,7 +1841,6 @@ public:
 
     /** frequencies of const patterns to be inserted into alignment */
     char *freq_const_patterns;
-
     /** BQM 2015-02-25: true to NOT rescale Gamma+Invar rates by (1-p_invar) */
     bool no_rescale_gamma_invar;
 
@@ -1979,7 +2086,7 @@ int64_t convert_int64(const char *str) throw (string);
         @param end_pos end position
         @return the number
  */
-int64_t convert_int64(const char *str, int64_t &end_pos) throw (string);
+int64_t convert_int64(const char *str, int &end_pos) throw (string);
 
 /**
         convert string to double, with error checking
@@ -2400,4 +2507,29 @@ inline uint32_t popcount_lauradoux(unsigned *buf, int n) {
  */
 bool memcmpcpy(void * destination, const void * source, size_t num);
 
+/**
+ *  Generate a unique integer from a pair of integers.
+ *  This method is called the Cantor pairing function (see Wikipedia).
+ *  @param int1 the first integer
+ *  @param int2 the second integer
+ *  @return the encoding of the two integers
+ */
+int pairInteger(int int1, int int2);
+
+template <typename T>
+string NumberToString ( T Number )
+{
+    ostringstream ss;
+    ss << Number;
+    return ss.str();
+}
+
+template <typename T>
+T StringToNumber ( const string &Text )
+{
+    istringstream ss(Text);
+    T result;
+    return ss >> result ? result : 0;
+}
+
 #endif
diff --git a/vectorclass/changelog.txt b/vectorclass/changelog.txt
index a27dad3..4018a1b 100755
--- a/vectorclass/changelog.txt
+++ b/vectorclass/changelog.txt
@@ -1,18 +1,31 @@
 change log for vectorclass.zip
 ------------------------------
-version 1.20
-  * round functions: suppress precision exception under SSE4.1 and higher
 
+2016-09-27 version 1.23
+  * temporary fix of a problem in Clang version 3.9 inserted in vectorf128.h
+
+2016-05-03 version 1.22
+  * added optional namespace
+  * fixed problem with decimal.h  
+
+2016-04-24 version 1.21
+  * fix problems with XOP option in gcc
+  * improved horizontal_and/or for sse2
+  * improved Vec2q and Vec4q constructor on Microsoft Visual Studio 2015
+  * removed warnings by gcc option -Wcast-qual  
+
+2015-12-04 version 1.20
+  * round functions: suppress precision exception under SSE4.1 and higher
+  * fix compiler problems with AVX512 multiplication in gcc version 5.1
+  * fix compiler problems with pow function in Microsoft Visual Studio 2015
 
 2015-11-14 version 1.19
   * fix various problems with Clang compiler
 
-
 2015-09-25 version 1.18
   * fix compiler error for Vec8s divide_by_i(Vec8s const & x) under Clang compiler
   * fix error in Vec4d::size() in vectorf256e.h
 
-
 2015-07-31 version 1.17
   * improved operator > for Vec4uq
   * more special cases in blend4q
@@ -21,7 +34,6 @@ version 1.20
     with macro named BTYPE in winnt.h
   * fixed bug in Vec4db constructor
 
-
 2014-10-24 version 1.16
   * workaround for problem in Clang compiler extended to version 3.09 because not fixed yet by Clang
     (vectorf128.h line 134)
@@ -31,7 +43,6 @@ version 1.20
   * manual discusses dynamic allocation of arrays of vectors
   * various minor changes
 
-
 2014-10-17 version 1.15
   * added files ranvec1.h and ranvec1.cpp for random number generator
   * constructors to make boolean vectors from their elements
@@ -44,7 +55,6 @@ version 1.20
   * explicit fused multiply-and-add used in math functions to improve performance 
     on compilers that don't automatically insert FMA
 
-
 2014-07-24 version 1.14
   * support for AVX-512f instruction set and 512-bit vectors:
     Vec16i, Vec16ui, Vec8q, Vec8uq, Vec16f, Vec8d, and corresponding boolean vectors
@@ -56,14 +66,12 @@ version 1.20
   * improved precision in exp2 and exp10 functions
   * various bug fixes
 
-
 2014-05-11 version 1.13
   * pow function improved
   * mul_add, mul_sub, mul_sub_x functions
   * propagation of error codes through nan_code function
   * "denormal" renamed to "subnormal" everywhere, in accordance with IEEE 754-2008 standard
 
-
 2014-04-20 version 1.12
   * inline implementation of mathematical functions added (vectormath_exp.h vectormath_trig.h vectormath_common.h)
   * vectormath.h renamed to vectormath_lib.h because a new alternative is added
@@ -84,11 +92,9 @@ version 1.20
   * removed signalling nan function
   * minor improvements in various blend and lookup functions
 
-
 2014-03-01 version 1.11
   * fixed missing unsigned operators >>= in vectori256.h
 
-
 2013-10-04 version 1.10
   * clear distinction between boolean vectors and integer vectors for the sake of 
     compatibility with mask registers in forthcoming AVX512 instruction set
@@ -101,11 +107,9 @@ version 1.20
   * workaround problem in MS Visual Studio 11.0. Bug report 735861 and 804274
   * minor bug fixes
 
-
 2013-03-31 version 1.03 beta
   * bug fix for Vec2d cos (Vec2d const & x), VECTORMATH = 1
 
-
 2012-08-01 version 1.02 beta
   * added file vector3d.h for 3-dimensional vectors
   * added file complexvec.h for complex numbers and complex vectors
@@ -113,7 +117,6 @@ version 1.20
   * added function change_sign for floating point vectors
   * added operators +, -, *, / between floating point vectors and scalars to remove overloading ambiguity
 
-
 2012-07-08 version 1.01 beta
   * added file decimal.h with Number <-> string conversion functions: 
     bin2bcd, bin2ascii, bin2hex_ascii, ascii2bin
@@ -126,6 +129,5 @@ version 1.20
   * minor improvement in abs function
   * added version number to VECTORCLASS_H
 
-
 2012-05-30 version 1.00 beta
   * first public release
diff --git a/vectorclass/dispatch_example.cpp b/vectorclass/dispatch_example.cpp
index 640a683..4fd05e3 100755
--- a/vectorclass/dispatch_example.cpp
+++ b/vectorclass/dispatch_example.cpp
@@ -1,8 +1,8 @@
 /*************************  dispatch_example.cpp   ****************************
 | Author:        Agner Fog
 | Date created:  2012-05-30
-| Last modified: 2014-07-23
-| Version:       1.14
+* Last modified: 2016-04-26
+* Version:       1.22
 | Project:       vector classes
 | Description:
 | Example of CPU dispatching.
@@ -17,7 +17,7 @@
 | g++ -O3 -msse2 -otest instrset_detect.cpp d2.o d5.o d7.o d8.o d9.o
 | ./test
 |
-| (c) Copyright 2012 - 2014 GNU General Public License http://www.gnu.org/licenses
+| (c) Copyright 2012-2016 GNU General Public License http://www.gnu.org/licenses
 \*****************************************************************************/
 
 #include <stdio.h>
@@ -25,6 +25,9 @@
 #define MAX_VECTOR_SIZE 512
 #include "vectorclass.h"
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 // define function type (change this to fit your purpose. Should not contain vector types)
 typedef float MyFuncType(float*);
@@ -97,3 +100,6 @@ int main(int argc, char* argv[])
 
 #endif  // INSTRSET == 2
 
+#ifdef VCL_NAMESPACE
+}
+#endif
diff --git a/vectorclass/instrset.h b/vectorclass/instrset.h
index 4fb83e2..fb3ebf6 100755
--- a/vectorclass/instrset.h
+++ b/vectorclass/instrset.h
@@ -1,8 +1,8 @@
 /****************************  instrset.h   **********************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2014-10-22
-* Version:       1.16
+* Last modified: 2016-05-02
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file for various compiler-specific tasks and other common tasks to 
@@ -14,11 +14,11 @@
 * > defines template class to represent compile-time integer constant
 * > defines template for compile-time error messages
 *
-* (c) Copyright 2012 - 2014 GNU General Public License www.gnu.org/licenses
+* (c) Copyright 2012 - 2016 GNU General Public License www.gnu.org/licenses
 ******************************************************************************/
 
 #ifndef INSTRSET_H
-#define INSTRSET_H 116
+#define INSTRSET_H 122
 
 // Detect 64 bit mode
 #if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64) ) && ! defined(__x86_64__)
@@ -26,7 +26,7 @@
 #endif
 
 // Find instruction set from compiler macros if INSTRSET not defined
-// Note: Microsoft compilers do not define these macros automatically
+// Note: Most of these macros are not defined in Microsoft compilers
 #ifndef INSTRSET
 #if defined ( __AVX512F__ ) || defined ( __AVX512__ ) // || defined ( __AVX512ER__ ) 
 #define INSTRSET 9
@@ -107,7 +107,7 @@
 // FMA4 instruction set
 #if defined (__FMA4__) && (defined(__GNUC__) || defined(__clang__))
 #include <fma4intrin.h> // must have both x86intrin.h and fma4intrin.h, don't know why
-#endif // __FMA4__ 
+#endif // __FMA4__
 
 
 // Define integer types with known size
@@ -156,10 +156,16 @@
 #endif // _MSC_VER
 
 // functions in instrset_detect.cpp
-int  instrset_detect(void);                      // tells which instruction sets are supported
-bool hasFMA3(void);                              // true if FMA3 instructions supported
-bool hasFMA4(void);                              // true if FMA4 instructions supported
-bool hasXOP (void);                              // true if XOP  instructions supported
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+    int  instrset_detect(void);                      // tells which instruction sets are supported
+    bool hasFMA3(void);                              // true if FMA3 instructions supported
+    bool hasFMA4(void);                              // true if FMA4 instructions supported
+    bool hasXOP(void);                              // true if XOP  instructions supported
+#ifdef VCL_NAMESPACE
+}
+#endif
 
 // GCC version
 #if defined(__GNUC__) && !defined (GCC_VERSION) && !defined (__clang__)
@@ -174,7 +180,7 @@ bool hasXOP (void);                              // true if XOP  instructions su
 // Apple bug 18746972
 #endif
 
-// Fix problem with macros named min and max in WinDef.h
+// Fix problem with non-overloadable macros named min and max in WinDef.h
 #ifdef _MSC_VER
 #if defined (_WINDEF_) && defined(min) && defined(max)
 #undef min
@@ -185,19 +191,25 @@ bool hasXOP (void);                              // true if XOP  instructions su
 #endif
 #endif
 
-// Template class to represent compile-time integer constant
-template <int32_t  n> class Const_int_t  {};     // represent compile-time signed integer constant
-template <uint32_t n> class Const_uint_t {};     // represent compile-time unsigned integer constant
-#define const_int(n)  (Const_int_t <n>())        // n must be compile-time integer constant
-#define const_uint(n) (Const_uint_t<n>())        // n must be compile-time unsigned integer constant
-
-// Template for compile-time error messages
-template <bool> class Static_error_check {
-    public:  Static_error_check(){};
-};
-template <> class Static_error_check<false> {    // generate compile-time error if false
-    private: Static_error_check(){};
-};
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+    // Template class to represent compile-time integer constant
+    template <int32_t  n> class Const_int_t {};       // represent compile-time signed integer constant
+    template <uint32_t n> class Const_uint_t {};      // represent compile-time unsigned integer constant
+    #define const_int(n)  (Const_int_t <n>())         // n must be compile-time integer constant
+    #define const_uint(n) (Const_uint_t<n>())         // n must be compile-time unsigned integer constant
+
+    // Template for compile-time error messages
+    template <bool> class Static_error_check {
+    public:  Static_error_check() {};
+    };
+    template <> class Static_error_check<false> {     // generate compile-time error if false
+    private: Static_error_check() {};
+    };
+#ifdef VCL_NAMESPACE
+}
+#endif 
 
 
 #endif // INSTRSET_H
diff --git a/vectorclass/instrset_detect.cpp b/vectorclass/instrset_detect.cpp
index 03c5777..5023d08 100755
--- a/vectorclass/instrset_detect.cpp
+++ b/vectorclass/instrset_detect.cpp
@@ -1,26 +1,30 @@
 /**************************  instrset_detect.cpp   ****************************
 | Author:        Agner Fog
 | Date created:  2012-05-30
-| Last modified: 2014-07-23
-| Version:       1.14
+* Last modified: 2016-04-26
+* Version:       1.22
 | Project:       vector classes
 | Description:
 | Functions for checking which instruction sets are supported.
 |
-| (c) Copyright 2012 - 2014 GNU General Public License http://www.gnu.org/licenses
+| (c) Copyright 2012-2016 GNU General Public License http://www.gnu.org/licenses
 \*****************************************************************************/
 
 #include "instrset.h"
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+
 // Define interface to cpuid instruction.
 // input:  eax = functionnumber, ecx = 0
 // output: eax = output[0], ebx = output[1], ecx = output[2], edx = output[3]
 static inline void cpuid (int output[4], int functionnumber) {	
-#if defined (_MSC_VER) || defined (__INTEL_COMPILER)       // Microsoft or Intel compiler, intrin.h included
+#if defined (_MSC_VER) //|| defined (__INTEL_COMPILER)       // Microsoft or Intel compiler, intrin.h included
 
     __cpuidex(output, functionnumber, 0);                  // intrinsic function for CPUID
 
-#elif defined(__GNUC__) || defined(__clang__)              // use inline assembly, Gnu/AT&T syntax
+#elif defined(__GNUC__) || defined(__clang__) || defined (__INTEL_COMPILER)              // use inline assembly, Gnu/AT&T syntax
 
    int a, b, c, d;
    __asm("cpuid" : "=a"(a),"=b"(b),"=c"(c),"=d"(d) : "a"(functionnumber),"c"(0) : );
@@ -151,3 +155,7 @@ bool hasXOP(void) {
     cpuid(abcd, 0x80000001);                               // call cpuid function 0x80000001
     return ((abcd[2] & (1 << 11)) != 0);                   // ecx bit 11 indicates XOP
 }
+
+#ifdef VCL_NAMESPACE
+}
+#endif
diff --git a/vectorclass/special.zip b/vectorclass/special.zip
index 3f3ce57..47abdbe 100755
Binary files a/vectorclass/special.zip and b/vectorclass/special.zip differ
diff --git a/vectorclass/vectorclass.h b/vectorclass/vectorclass.h
index 0368ef8..426f6b9 100755
--- a/vectorclass/vectorclass.h
+++ b/vectorclass/vectorclass.h
@@ -1,8 +1,8 @@
 /****************************  vectorclass.h   ********************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2015-11-07
-* Version:       1.19
+* Last modified: 2016-09-27
+* Version:       1.23
 * Project:       vector classes
 * Description:
 * Header file defining vector classes as interface to intrinsic functions 
@@ -22,10 +22,10 @@
 *
 * For detailed instructions, see VectorClass.pdf
 *
-* (c) Copyright 2012 - 2015 GNU General Public License www.gnu.org/licenses
+* (c) Copyright 2012-2016 GNU General Public License www.gnu.org/licenses
 ******************************************************************************/
 #ifndef VECTORCLASS_H
-#define VECTORCLASS_H  116
+#define VECTORCLASS_H  123
 
 // Maximum vector size, bits. Allowed values are 128, 256, 512
 #ifndef MAX_VECTOR_SIZE
@@ -64,6 +64,6 @@
 #endif  //  INSTRSET >= 9
 #endif  //  MAX_VECTOR_SIZE >= 512
 
-#endif  // INSTRSET < 2 
+#endif  // INSTRSET < 2
 
 #endif  // VECTORCLASS_H
diff --git a/vectorclass/vectorclass.pdf b/vectorclass/vectorclass.pdf
index 91e66c8..498385a 100755
Binary files a/vectorclass/vectorclass.pdf and b/vectorclass/vectorclass.pdf differ
diff --git a/vectorclass/vectorf128.h b/vectorclass/vectorf128.h
index 86fca47..defefce 100755
--- a/vectorclass/vectorf128.h
+++ b/vectorclass/vectorf128.h
@@ -1,8 +1,8 @@
 /****************************  vectorf128.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2015-11-27
-* Version:       1.20
+* Last modified: 2016-09-27
+* Version:       1.23
 * Project:       vector classes
 * Description:
 * Header file defining floating point vector classes as interface to 
@@ -30,14 +30,22 @@
 *
 * For detailed instructions, see VectorClass.pdf
 *
-* (c) Copyright 2012 - 2015 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2012 - 2016 GNU General Public License http://www.gnu.org/licenses
 *****************************************************************************/
 #ifndef VECTORF128_H
 #define VECTORF128_H
 
-#include "vectori128.h"  // Define integer vectors
+#if defined _MSC_VER && _MSC_VER >= 1800
+// solve problem with ambiguous overloading of pow function in Microsoft math.h:
+// make sure math.h is included first rather than last
+#include <math.h>
+#endif 
 
+#include "vectori128.h"  // Define integer vectors
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 /*****************************************************************************
 *
@@ -131,11 +139,11 @@ public:
     operator __m128() const {
         return xmm;
     }
-#if defined (__clang__) && CLANG_VERSION < 30900 || defined(__apple_build_version__)
-#define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY  // clang 3.3 - 3.5 has silent conversion between intrinsic vector types. 
-                                          // I expected this to be fixed in version 3.4 but it still exists!
+#if defined (__clang__) /* && CLANG_VERSION < xxxxx */ || defined(__apple_build_version__)
+#define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY  // clang 3.3 has silent conversion between intrinsic vector types. 
+                                          // I expected this to be fixed in version 3.4 but it still exists in version 3.9!
                                           // http://llvm.org/bugs/show_bug.cgi?id=17164
-                                          // Problem: The version number is not consistent across platforms
+                                          // Additional problem: The version number is not consistent across platforms
                                           // The Apple build has different version numbers. Too bad!
                                           // http://llvm.org/bugs/show_bug.cgi?id=12643
 
@@ -246,12 +254,14 @@ static inline Vec4fb andnot(Vec4fb const & a, Vec4fb const & b) {
 
 // horizontal_and. Returns true if all bits are 1
 static inline bool horizontal_and (Vec4fb const & a) {
-    return horizontal_and(Vec128b(_mm_castps_si128(a)));
+    return _mm_movemask_ps(a) == 0x0F; 
+    //return horizontal_and(Vec128b(_mm_castps_si128(a)));
 }
 
 // horizontal_or. Returns true if at least one bit is 1
 static inline bool horizontal_or (Vec4fb const & a) {
-    return horizontal_or(Vec128b(_mm_castps_si128(a)));
+    return _mm_movemask_ps(a) != 0;
+    //return horizontal_or(Vec128b(_mm_castps_si128(a)));
 }
 
 
@@ -414,12 +424,14 @@ static inline Vec2db andnot(Vec2db const & a, Vec2db const & b) {
 
 // horizontal_and. Returns true if all bits are 1
 static inline bool horizontal_and (Vec2db const & a) {
-    return horizontal_and(Vec128b(_mm_castpd_si128(a)));
+    return _mm_movemask_pd(a) == 3;
+    //return horizontal_and(Vec128b(_mm_castpd_si128(a)));
 }
 
 // horizontal_or. Returns true if at least one bit is 1
 static inline bool horizontal_or (Vec2db const & a) {
-    return horizontal_or(Vec128b(_mm_castpd_si128(a)));
+    return _mm_movemask_pd(a) != 0;
+    //return horizontal_or(Vec128b(_mm_castpd_si128(a)));
 }
 
 
@@ -491,9 +503,9 @@ public:
         case 1:
             xmm = _mm_load_ss(p); break;
         case 2:
-            xmm = _mm_castpd_ps(_mm_load_sd((double*)p)); break;
+            xmm = _mm_castpd_ps(_mm_load_sd((double const*)p)); break;
         case 3:
-            t1 = _mm_castpd_ps(_mm_load_sd((double*)p));
+            t1 = _mm_castpd_ps(_mm_load_sd((double const*)p));
             t2 = _mm_load_ss(p + 2);
             xmm = _mm_movelh_ps(t1, t2); break;
         case 4:
@@ -1894,7 +1906,7 @@ static inline Vec2d pow(Vec2d const & a, Const_int_t<n>) {
 
 // avoid unsafe optimization in function round
 #if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) && INSTRSET < 5
-static inline Vec4f round(Vec4f const & a) __attribute__ ((optimize("-fno-unsafe-math-optimizations")));
+static inline Vec2d round(Vec2d const & a) __attribute__ ((optimize("-fno-unsafe-math-optimizations")));
 #elif defined (FLOAT_CONTROL_PRECISE_FOR_ROUND)
 #pragma float_control(push) 
 #pragma float_control(precise,on)
@@ -2618,4 +2630,8 @@ static inline Vec2db to_Vec2db(uint8_t x) {
     return Vec2db(to_Vec2qb(x));
 }
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif // VECTORF128_H
diff --git a/vectorclass/vectorf256.h b/vectorclass/vectorf256.h
index dc126a0..148442a 100755
--- a/vectorclass/vectorf256.h
+++ b/vectorclass/vectorf256.h
@@ -1,8 +1,8 @@
 /****************************  vectorf256.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2015-11-27
-* Version:       1.20
+* Last modified: 2016-04-26
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file defining 256-bit floating point vector classes as interface
@@ -27,7 +27,7 @@
 *
 * For detailed instructions, see VectorClass.pdf
 *
-* (c) Copyright 2012 - 2015 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2012 - 2016 GNU General Public License http://www.gnu.org/licenses
 *****************************************************************************/
 
 // check combination of header files
@@ -44,7 +44,9 @@
 
 #include "vectorf128.h"  // Define 128-bit vectors
 
-
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 /*****************************************************************************
 *
@@ -3163,4 +3165,8 @@ static inline Vec4db to_Vec4db(uint8_t x) {
     return Vec4db(to_Vec4qb(x));
 }
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif // VECTORF256_H
diff --git a/vectorclass/vectorf256e.h b/vectorclass/vectorf256e.h
index 39c4410..386da1b 100755
--- a/vectorclass/vectorf256e.h
+++ b/vectorclass/vectorf256e.h
@@ -1,8 +1,8 @@
 /****************************  vectorf256e.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2015-08-25
-* Version:       1.18
+* Last modified: 2016-04-26
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file defining 256-bit floating point vector classes as interface
@@ -16,7 +16,7 @@
 *
 * For detailed instructions, see VectorClass.pdf
 *
-* (c) Copyright 2012 - 2015 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2012 - 2016 GNU General Public License http://www.gnu.org/licenses
 *****************************************************************************/
 
 // check combination of header files
@@ -34,6 +34,9 @@
 
 #include "vectorf128.h"  // Define 128-bit vectors
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 /*****************************************************************************
 *
@@ -2066,4 +2069,8 @@ static inline Vec4db to_Vec4db(uint8_t x) {
     return Vec4db(to_Vec4qb(x));
 }
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif // VECTORF256_H
diff --git a/vectorclass/vectorf512.h b/vectorclass/vectorf512.h
index 5fab837..cfd16f3 100755
--- a/vectorclass/vectorf512.h
+++ b/vectorclass/vectorf512.h
@@ -1,8 +1,8 @@
 /****************************  vectorf512.h   *******************************
 * Author:        Agner Fog
 * Date created:  2014-07-23
-* Last modified: 2015-11-27
-* Version:       1.20
+* Last modified: 2016-04-26
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file defining floating point vector classes as interface to intrinsic 
@@ -23,7 +23,7 @@
 *
 * For detailed instructions, see VectorClass.pdf
 *
-* (c) Copyright 2015 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
 *****************************************************************************/
 
 // check combination of header files
@@ -36,6 +36,10 @@
 
 #include "vectori512.h"
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+
 // Define missing intrinsic functions
 #if defined (GCC_VERSION) && GCC_VERSION < 41102 && !defined(__INTEL_COMPILER) && !defined(__clang__)
 
@@ -787,12 +791,12 @@ static inline Vec16f ceil(Vec16f const & a) {
 
 // function round_to_int: round to nearest integer (even). (result as integer vector)
 static inline Vec16i round_to_int(Vec16f const & a) {
-    return _mm512_cvt_roundps_epi32(a, _MM_FROUND_NO_EXC);
+    return _mm512_cvt_roundps_epi32(a, 0+8 /*_MM_FROUND_NO_EXC*/);
 }
 
 // function truncate_to_int: round towards zero. (result as integer vector)
 static inline Vec16i truncate_to_int(Vec16f const & a) {
-    return _mm512_cvtt_roundps_epi32(a, _MM_FROUND_NO_EXC);
+    return _mm512_cvtt_roundps_epi32(a, 0+8 /*_MM_FROUND_NO_EXC*/);
 }
 
 // function to_float: convert integer vector to float vector
@@ -1438,7 +1442,7 @@ static inline Vec8d ceil(Vec8d const & a) {
 // function round_to_int: round to nearest integer (even). (result as integer vector)
 static inline Vec8i round_to_int(Vec8d const & a) {
     //return _mm512_cvtpd_epi32(a);
-    return _mm512_cvt_roundpd_epi32(a, __MM_FROUND_NO_EXC);
+    return _mm512_cvt_roundpd_epi32(a, 0+8);
 }
 
 // function truncate_to_int: round towards zero. (result as integer vector)
@@ -1473,7 +1477,7 @@ static inline Vec8q round_to_int64(Vec8d const & a) {
 // result as 64-bit integer vector, but with limited range
 static inline Vec8q round_to_int64_limited(Vec8d const & a) {
     //Vec4q   b = _mm512_cvtpd_epi32(a);                             // round to 32-bit integers
-    Vec4q   b = _mm512_cvt_roundpd_epi32(a, __MM_FROUND_NO_EXC);     // round to 32-bit integers   
+    Vec4q   b = _mm512_cvt_roundpd_epi32(a, 0+8);     // round to 32-bit integers   
     __m512i c = permute8q<0,-256,1,-256,2,-256,3,-256>(Vec8q(b,b));  // get bits 64-127 to position 128-191, etc.
     __m512i s = _mm512_srai_epi32(c, 31);                            // sign extension bits
     return      _mm512_unpacklo_epi32(c, s);                         // interleave with sign extensions
@@ -2362,4 +2366,8 @@ static inline Vec8db to_Vec8db(uint8_t x) {
     return Vec8db(to_Vec8qb(x));
 }
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif // VECTORF512_H
diff --git a/vectorclass/vectorf512e.h b/vectorclass/vectorf512e.h
index a0077b3..77ad9ca 100755
--- a/vectorclass/vectorf512e.h
+++ b/vectorclass/vectorf512e.h
@@ -1,8 +1,8 @@
 /****************************  vectorf512.h   *******************************
 * Author:        Agner Fog
 * Date created:  2014-07-23
-* Last modified: 2014-10-22
-* Version:       1.16
+* Last modified: 2016-04-26
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file defining floating point vector classes as interface to intrinsic 
@@ -23,7 +23,7 @@
 *
 * For detailed instructions, see VectorClass.pdf
 *
-* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
 *****************************************************************************/
 
 // check combination of header files
@@ -36,6 +36,9 @@
 
 #include "vectori512e.h"
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 /*****************************************************************************
 *
@@ -2124,4 +2127,8 @@ static inline Vec8db to_Vec8db(uint8_t x) {
     return Vec8db(to_Vec8qb(x));
 }
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif // VECTORF512_H
diff --git a/vectorclass/vectori128.h b/vectorclass/vectori128.h
index 73f53d5..76d29fb 100755
--- a/vectorclass/vectori128.h
+++ b/vectorclass/vectori128.h
@@ -1,8 +1,8 @@
 /****************************  vectori128.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2015-11-07
-* Version:       1.19
+* Last modified: 2016-04-26
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file defining integer vector classes as interface to intrinsic 
@@ -39,7 +39,7 @@
 *
 * For detailed instructions, see VectorClass.pdf
 *
-* (c) Copyright 2012 - 2015 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2012 - 2016 GNU General Public License http://www.gnu.org/licenses
 *****************************************************************************/
 #ifndef VECTORI128_H
 #define VECTORI128_H
@@ -50,7 +50,9 @@
 #error Please compile for the SSE2 instruction set or higher
 #endif
 
-
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 /*****************************************************************************
 *
@@ -344,7 +346,7 @@ public:
         else {
             // worst case. read 1 byte at a time and suffer store forwarding penalty
             char x[16];
-            for (int i = 0; i < n; i++) x[i] = ((char *)p)[i];
+            for (int i = 0; i < n; i++) x[i] = ((char const *)p)[i];
             load(x);
         }
         cutoff(n);
@@ -526,6 +528,22 @@ static inline Vec16cb andnot (Vec16cb const & a, Vec16cb const & b) {
     return Vec16cb(andnot(Vec128b(a), Vec128b(b)));
 }
 
+// Horizontal Boolean functions for Vec16cb
+
+// horizontal_and. Returns true if all elements are true
+static inline bool horizontal_and(Vec16cb const & a) {
+    return _mm_movemask_epi8(a) == 0xFFFF;
+}
+
+// horizontal_or. Returns true if at least one element is true
+static inline bool horizontal_or(Vec16cb const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported. Use PTEST
+    return !_mm_testz_si128(a, a);
+#else
+    return _mm_movemask_epi8(a) != 0;
+#endif
+} 
+
 
 /*****************************************************************************
 *
@@ -643,7 +661,7 @@ static inline Vec16cb operator == (Vec16c const & a, Vec16c const & b) {
 // vector operator != : returns true for elements for which a != b
 static inline Vec16cb operator != (Vec16c const & a, Vec16c const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return _mm_comneq_epi8(a,b);
+    return (Vec16cb)_mm_comneq_epi8(a,b);
 #else  // SSE2 instruction set
     return Vec16cb(Vec16c(~(a == b)));
 #endif
@@ -662,7 +680,7 @@ static inline Vec16cb operator < (Vec16c const & a, Vec16c const & b) {
 // vector operator >= : returns true for elements for which a >= b (signed)
 static inline Vec16cb operator >= (Vec16c const & a, Vec16c const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return _mm_comge_epi8(a,b);
+    return (Vec16cb)_mm_comge_epi8(a,b);
 #else  // SSE2 instruction set
     return Vec16cb(Vec16c(~(b > a)));
 #endif
@@ -947,9 +965,9 @@ static inline Vec16uc & operator >>= (Vec16uc & a, int b) {
 // vector operator >= : returns true for elements for which a >= b (unsigned)
 static inline Vec16cb operator >= (Vec16uc const & a, Vec16uc const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return _mm_comge_epu8(a,b);
+    return (Vec16cb)_mm_comge_epu8(a,b);
 #else  // SSE2 instruction set
-    return _mm_cmpeq_epi8(_mm_max_epu8(a,b),a); // a == max(a,b)
+    return (Vec16cb)_mm_cmpeq_epi8(_mm_max_epu8(a,b),a); // a == max(a,b)
 #endif
 }
 
@@ -961,7 +979,7 @@ static inline Vec16cb operator <= (Vec16uc const & a, Vec16uc const & b) {
 // vector operator > : returns true for elements for which a > b (unsigned)
 static inline Vec16cb operator > (Vec16uc const & a, Vec16uc const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return _mm_comgt_epu8(a,b);
+    return (Vec16cb)_mm_comgt_epu8(a,b);
 #else  // SSE2 instruction set
     return Vec16cb(Vec16c(~(b >= a)));
 #endif
@@ -1123,7 +1141,7 @@ public:
         else {
             // worst case. read 1 byte at a time and suffer store forwarding penalty
             int16_t x[8];
-            for (int i = 0; i < n; i++) x[i] = ((int16_t *)p)[i];
+            for (int i = 0; i < n; i++) x[i] = ((int16_t const *)p)[i];
             load(x);
         }
         cutoff(n);
@@ -1328,6 +1346,22 @@ static inline Vec8sb andnot (Vec8sb const & a, Vec8sb const & b) {
     return Vec8sb(andnot(Vec128b(a), Vec128b(b)));
 }
 
+// Horizontal Boolean functions for Vec8sb
+
+// horizontal_and. Returns true if all elements are true
+static inline bool horizontal_and(Vec8sb const & a) {
+    return _mm_movemask_epi8(a) == 0xFFFF;
+}
+
+// horizontal_or. Returns true if at least one element is true
+static inline bool horizontal_or(Vec8sb const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported. Use PTEST
+    return !_mm_testz_si128(a, a);
+#else
+    return _mm_movemask_epi8(a) != 0;
+#endif
+}
+
 
 /*****************************************************************************
 *
@@ -1433,7 +1467,7 @@ static inline Vec8sb operator == (Vec8s const & a, Vec8s const & b) {
 // vector operator != : returns true for elements for which a != b
 static inline Vec8sb operator != (Vec8s const & a, Vec8s const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return _mm_comneq_epi16(a,b);
+    return (Vec8sb)_mm_comneq_epi16(a,b);
 #else  // SSE2 instruction set
     return Vec8sb (~(a == b));
 #endif
@@ -1452,7 +1486,7 @@ static inline Vec8sb operator < (Vec8s const & a, Vec8s const & b) {
 // vector operator >= : returns true for elements for which a >= b (signed)
 static inline Vec8sb operator >= (Vec8s const & a, Vec8s const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return _mm_comge_epi16(a,b);
+    return (Vec8sb)_mm_comge_epi16(a,b);
 #else  // SSE2 instruction set
     return Vec8sb (~(b > a));
 #endif
@@ -1753,7 +1787,7 @@ static inline Vec8s operator <= (Vec8us const & a, Vec8us const & b) {
 // vector operator > : returns true for elements for which a > b (unsigned)
 static inline Vec8s operator > (Vec8us const & a, Vec8us const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return _mm_comgt_epu16(a,b);
+    return (Vec8s)_mm_comgt_epu16(a,b);
 #else  // SSE2 instruction set
     return Vec8s (~(b >= a));
 #endif
@@ -1947,12 +1981,12 @@ public:
         case 0:
             *this = 0;  break;
         case 1:
-            xmm = _mm_cvtsi32_si128(*(int32_t*)p);  break;
+            xmm = _mm_cvtsi32_si128(*(int32_t const*)p);  break;
         case 2:
             // intrinsic for movq is missing!
-            xmm = _mm_setr_epi32(((int32_t*)p)[0], ((int32_t*)p)[1], 0, 0);  break;
+            xmm = _mm_setr_epi32(((int32_t const*)p)[0], ((int32_t const*)p)[1], 0, 0);  break;
         case 3:
-            xmm = _mm_setr_epi32(((int32_t*)p)[0], ((int32_t*)p)[1], ((int32_t*)p)[2], 0);  break;
+            xmm = _mm_setr_epi32(((int32_t const*)p)[0], ((int32_t const*)p)[1], ((int32_t const*)p)[2], 0);  break;
         case 4:
             load(p);  break;
         default: 
@@ -2122,6 +2156,22 @@ static inline Vec4ib andnot (Vec4ib const & a, Vec4ib const & b) {
     return Vec4ib(andnot(Vec128b(a), Vec128b(b)));
 }
 
+// Horizontal Boolean functions for Vec4ib
+
+// horizontal_and. Returns true if all elements are true
+static inline bool horizontal_and(Vec4ib const & a) {
+    return _mm_movemask_epi8(a) == 0xFFFF;
+}
+
+// horizontal_or. Returns true if at least one element is true
+static inline bool horizontal_or(Vec4ib const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported. Use PTEST
+    return !_mm_testz_si128(a, a);
+#else
+    return _mm_movemask_epi8(a) != 0;
+#endif
+}
+
 
 /*****************************************************************************
 *
@@ -2237,7 +2287,7 @@ static inline Vec4ib operator == (Vec4i const & a, Vec4i const & b) {
 // vector operator != : returns true for elements for which a != b
 static inline Vec4ib operator != (Vec4i const & a, Vec4i const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return _mm_comneq_epi32(a,b);
+    return (Vec4ib)_mm_comneq_epi32(a,b);
 #else  // SSE2 instruction set
     return Vec4ib(Vec4i (~(a == b)));
 #endif
@@ -2256,7 +2306,7 @@ static inline Vec4ib operator < (Vec4i const & a, Vec4i const & b) {
 // vector operator >= : returns true for elements for which a >= b (signed)
 static inline Vec4ib operator >= (Vec4i const & a, Vec4i const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return _mm_comge_epi32(a,b);
+    return (Vec4ib)_mm_comge_epi32(a,b);
 #else  // SSE2 instruction set
     return Vec4ib(Vec4i (~(b > a)));
 #endif
@@ -2556,12 +2606,12 @@ static inline Vec4ui operator << (Vec4ui const & a, int32_t b) {
 // vector operator > : returns true for elements for which a > b (unsigned)
 static inline Vec4ib operator > (Vec4ui const & a, Vec4ui const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return _mm_comgt_epu32(a,b);
+    return (Vec4ib)_mm_comgt_epu32(a,b);
 #else  // SSE2 instruction set
     __m128i signbit = _mm_set1_epi32(0x80000000);
     __m128i a1      = _mm_xor_si128(a,signbit);
     __m128i b1      = _mm_xor_si128(b,signbit);
-    return _mm_cmpgt_epi32(a1,b1);                         // signed compare
+    return (Vec4ib)_mm_cmpgt_epi32(a1,b1);                         // signed compare
 #endif
 }
 
@@ -2573,10 +2623,10 @@ static inline Vec4ib operator < (Vec4ui const & a, Vec4ui const & b) {
 // vector operator >= : returns true for elements for which a >= b (unsigned)
 static inline Vec4ib operator >= (Vec4ui const & a, Vec4ui const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return _mm_comge_epu32(a,b);
+    return (Vec4ib)_mm_comge_epu32(a,b);
 #elif INSTRSET >= 5   // SSE4.1
     __m128i max_ab = _mm_max_epu32(a,b);                   // max(a,b), unsigned
-    return _mm_cmpeq_epi32(a,max_ab);                      // a == max(a,b)
+    return (Vec4ib)_mm_cmpeq_epi32(a,max_ab);                      // a == max(a,b)
 #else  // SSE2 instruction set
     return Vec4ib(Vec4i (~(b > a)));
 #endif
@@ -2706,7 +2756,7 @@ public:
     }
     // Constructor to broadcast the same value into all elements:
     Vec2q(int64_t i) {
-#if defined (_MSC_VER) && ! defined(__INTEL_COMPILER)
+#if defined (_MSC_VER) && _MSC_VER < 1900 && ! defined(__INTEL_COMPILER)
         // MS compiler has no _mm_set1_epi64x in 32 bit mode
 #if defined(__x86_64__)                                    // 64 bit mode
 #if _MSC_VER < 1700
@@ -2733,12 +2783,12 @@ public:
 
 #endif  // __x86_64__
 #else   // Other compilers
-        xmm = _mm_set1_epi64x(i);   // emmintrin.h
+        xmm = _mm_set1_epi64x(i);
 #endif
     }
     // Constructor to build from all elements:
     Vec2q(int64_t i0, int64_t i1) {
-#if defined (_MSC_VER) && ! defined(__INTEL_COMPILER)
+#if defined (_MSC_VER)  && _MSC_VER < 1900 && ! defined(__INTEL_COMPILER)
         // MS compiler has no _mm_set_epi64x in 32 bit mode
 #if defined(__x86_64__)                                    // 64 bit mode
 #if _MSC_VER < 1700
@@ -2791,7 +2841,7 @@ public:
             *this = 0;  break;
         case 1:
             // intrinsic for movq is missing!
-            *this = Vec2q(*(int64_t*)p, 0);  break;
+            *this = Vec2q(*(int64_t const*)p, 0);  break;
         case 2:
             load(p);  break;
         default: 
@@ -2974,6 +3024,22 @@ static inline Vec2qb andnot (Vec2qb const & a, Vec2qb const & b) {
     return Vec2qb(andnot(Vec128b(a), Vec128b(b)));
 }
 
+// Horizontal Boolean functions for Vec2qb
+
+// horizontal_and. Returns true if all elements are true
+static inline bool horizontal_and(Vec2qb const & a) {
+    return _mm_movemask_epi8(a) == 0xFFFF;
+}
+
+// horizontal_or. Returns true if at least one element is true
+static inline bool horizontal_or(Vec2qb const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported. Use PTEST
+    return !_mm_testz_si128(a, a);
+#else
+    return _mm_movemask_epi8(a) != 0;
+#endif
+} 
+
 
 /*****************************************************************************
 *
@@ -3115,7 +3181,7 @@ static inline Vec2qb operator == (Vec2q const & a, Vec2q const & b) {
 // vector operator != : returns true for elements for which a != b
 static inline Vec2qb operator != (Vec2q const & a, Vec2q const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return Vec2q(_mm_comneq_epi64(a,b));
+    return Vec2qb(_mm_comneq_epi64(a,b));
 #else  // SSE2 instruction set
     return Vec2qb(Vec2q(~(a == b)));
 #endif
@@ -3148,7 +3214,7 @@ static inline Vec2qb operator > (Vec2q const & a, Vec2q const & b) {
 // vector operator >= : returns true for elements for which a >= b (signed)
 static inline Vec2qb operator >= (Vec2q const & a, Vec2q const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return Vec2q(_mm_comge_epi64(a,b));
+    return Vec2qb(_mm_comge_epi64(a,b));
 #else  // SSE2 instruction set
     return Vec2qb(Vec2q(~(a < b)));
 #endif
@@ -3275,12 +3341,12 @@ static inline Vec2q abs_saturated(Vec2q const & a) {
 // Use negative count to rotate right
 static inline Vec2q rotate_left(Vec2q const & a, int b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return _mm_rot_epi64(a,Vec2q(b));
+    return (Vec2q)_mm_rot_epi64(a,Vec2q(b));
 #else  // SSE2 instruction set
     __m128i left  = _mm_sll_epi64(a,_mm_cvtsi32_si128(b & 0x3F));      // a << b 
     __m128i right = _mm_srl_epi64(a,_mm_cvtsi32_si128((64-b) & 0x3F)); // a >> (64 - b)
     __m128i rot   = _mm_or_si128(left,right);                          // or
-    return  rot;
+    return  (Vec2q)rot;
 #endif
 }
 
@@ -3386,7 +3452,7 @@ static inline Vec2uq operator << (Vec2uq const & a, int32_t b) {
 // vector operator > : returns true for elements for which a > b (unsigned)
 static inline Vec2qb operator > (Vec2uq const & a, Vec2uq const & b) {
 #if defined ( __XOP__ ) // AMD XOP instruction set
-    return Vec2q(_mm_comgt_epu64(a,b));
+    return Vec2qb(_mm_comgt_epu64(a,b));
 #elif INSTRSET >= 6 // SSE4.2
     __m128i sign64 = constant4i<0,(int32_t)0x80000000,0,(int32_t)0x80000000>();
     __m128i aflip  = _mm_xor_si128(a, sign64);
@@ -3415,7 +3481,7 @@ static inline Vec2qb operator < (Vec2uq const & a, Vec2uq const & b) {
 // vector operator >= : returns true for elements for which a >= b (unsigned)
 static inline Vec2qb operator >= (Vec2uq const & a, Vec2uq const & b) {
 #ifdef __XOP__  // AMD XOP instruction set
-    return Vec2q(_mm_comge_epu64(a,b));
+    return Vec2qb(_mm_comge_epu64(a,b));
 #else  // SSE2 instruction set
     return  Vec2qb(Vec2q(~(b > a)));
 #endif
@@ -5374,7 +5440,7 @@ public:
         sign       = _mm_set1_epi32(sgn);
     }
     void set(int32_t d) {                                  // Set or change divisor, calculate parameters
-        const int32_t d1 = abs(d);
+        const int32_t d1 = ::abs(d);
         int32_t sh, m;
         if (d1 > 1) {
             sh = bit_scan_reverse(d1-1);                   // shift count = ceil(log2(d1))-1 = (bit_scan_reverse(d1-1)+1)-1
@@ -5471,7 +5537,7 @@ public:
         sign       = _mm_set1_epi32(sgn);
     }
     void set(int16_t d) {                                  // Set or change divisor, calculate parameters
-        const int32_t d1 = abs(d);
+        const int32_t d1 = ::abs(d);
         int32_t sh, m;
         if (d1 > 1) {
             sh = bit_scan_reverse(d1-1);                   // shift count = ceil(log2(d1))-1 = (bit_scan_reverse(d1-1)+1)-1
@@ -6148,4 +6214,8 @@ static inline Vec2qb to_Vec2qb(uint8_t x);
 
 #endif  // INSTRSET < 9 || MAX_VECTOR_SIZE < 512
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif // VECTORI128_H
diff --git a/vectorclass/vectori256.h b/vectorclass/vectori256.h
index 47da1ea..890212c 100755
--- a/vectorclass/vectori256.h
+++ b/vectorclass/vectori256.h
@@ -1,8 +1,8 @@
 /****************************  vectori256.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2015-11-08
-* Version:       1.19
+* Last modified: 2016-04-26
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file defining integer vector classes as interface to intrinsic 
@@ -36,7 +36,7 @@
 *
 * For detailed instructions, see VectorClass.pdf
 *
-* (c) Copyright 2012 - 2015 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2012 - 2016 GNU General Public License http://www.gnu.org/licenses
 *****************************************************************************/
 
 // check combination of header files
@@ -58,6 +58,9 @@
 
 #include "vectori128.h"
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 /*****************************************************************************
 *
@@ -334,7 +337,7 @@ public:
             *this = Vec32c(Vec16c().load_partial(n, p), 0);
         }
         else if (n < 32) {
-            *this = Vec32c(Vec16c().load(p), Vec16c().load_partial(n-16, (char*)p+16));
+            *this = Vec32c(Vec16c().load(p), Vec16c().load_partial(n-16, (char const*)p+16));
         }
         else {
             load(p);
@@ -1104,7 +1107,7 @@ public:
             *this = Vec16s(Vec8s().load_partial(n, p), 0);
         }
         else if (n < 16) {
-            *this = Vec16s(Vec8s().load(p), Vec8s().load_partial(n-8, (int16_t*)p+8));
+            *this = Vec16s(Vec8s().load(p), Vec8s().load_partial(n-8, (int16_t const*)p+8));
         }
         else {
             load(p);
@@ -1829,7 +1832,7 @@ public:
             *this = Vec8i(Vec4i().load_partial(n, p), 0);
         }
         else if (n < 8) {
-            *this = Vec8i(Vec4i().load(p), Vec4i().load_partial(n-4, (int32_t*)p+4));
+            *this = Vec8i(Vec4i().load(p), Vec4i().load_partial(n-4, (int32_t const*)p+4));
         }
         else {
             load(p);
@@ -2495,7 +2498,7 @@ public:
     }
     // Constructor to broadcast the same value into all elements:
     Vec4q(int64_t i) {
-#if defined (_MSC_VER) && ! defined (__x86_64__) && ! defined(__INTEL_COMPILER)
+#if defined (_MSC_VER) && _MSC_VER < 1900 && ! defined (__x86_64__) && ! defined(__INTEL_COMPILER)
         // MS compiler cannot use _mm256_set1_epi64x in 32 bit mode, and  
         // cannot put 64-bit values into xmm register without using
         // mmx registers, and it makes no emms
@@ -2511,7 +2514,7 @@ public:
     }
     // Constructor to build from all elements:
     Vec4q(int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-#if defined (_MSC_VER) && ! defined (__x86_64__) && ! defined(__INTEL_COMPILER)
+#if defined (_MSC_VER) && _MSC_VER < 1900 && ! defined (__x86_64__) && ! defined(__INTEL_COMPILER)
         // MS compiler cannot put 64-bit values into xmm register without using
         // mmx registers, and it makes no emms
         union {
@@ -2560,7 +2563,7 @@ public:
             *this = Vec4q(Vec2q().load_partial(n, p), 0);
         }
         else if (n < 4) {
-            *this = Vec4q(Vec2q().load(p), Vec2q().load_partial(n-2, (int64_t*)p+2));
+            *this = Vec4q(Vec2q().load(p), Vec2q().load_partial(n-2, (int64_t const*)p+2));
         }
         else {
             load(p);
@@ -3688,7 +3691,7 @@ static inline Vec32c permute32c(Vec32c const & a) {
         && i8 ==((i0+8 )&31) && i9 ==((i0+9 )&31) && i10==((i0+10)&31) && i11==((i0+11)&31) && i12==((i0+12)&31) && i13==((i0+13)&31) && i14==((i0+14)&31) && i15==((i0+15)&31)
         && i16==((i0+16)&31) && i17==((i0+17)&31) && i18==((i0+18)&31) && i19==((i0+19)&31) && i20==((i0+20)&31) && i21==((i0+21)&31) && i22==((i0+22)&31) && i23==((i0+23)&31)
         && i24==((i0+24)&31) && i25==((i0+25)&31) && i26==((i0+26)&31) && i27==((i0+27)&31) && i28==((i0+28)&31) && i29==((i0+29)&31) && i30==((i0+30)&31) && i31==((i0+31)&31)) {
-        __m256i t1 = _mm256_permute4x64_epi64(a, 0x4E);
+        t1 = _mm256_permute4x64_epi64(a, 0x4E);
         return _mm256_alignr_epi8(a, t1, i0 & 15);
     }
 
@@ -4581,7 +4584,7 @@ static inline Vec8i lookup(Vec8i const & index, void const * table) {
     }
     if (n <= 16) {
         Vec8i table1 = Vec8i().load(table);
-        Vec8i table2 = Vec8i().load((int32_t*)table + 8);
+        Vec8i table2 = Vec8i().load((int32_t const*)table + 8);
         Vec8i y1 = lookup8(index, table1);
         Vec8i y2 = lookup8(index, table2);
         Vec8ib s = index > 7;
@@ -5508,5 +5511,8 @@ static inline Vec4qb to_Vec4qb(uint8_t x);
 
 #endif  // INSTRSET < 9 || MAX_VECTOR_SIZE < 512
 
+#ifdef VCL_NAMESPACE
+}
+#endif
 
 #endif // VECTORI256_H
diff --git a/vectorclass/vectori256e.h b/vectorclass/vectori256e.h
index 71d0ffb..507b886 100755
--- a/vectorclass/vectori256e.h
+++ b/vectorclass/vectori256e.h
@@ -1,8 +1,8 @@
 /****************************  vectori256e.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2014-10-16
-* Version:       1.16
+* Last modified: 2016-04-26
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file defining 256-bit integer point vector classes as interface
@@ -25,7 +25,7 @@
 *
 * For detailed instructions, see VectorClass.pdf
 *
-* (c) Copyright 2012 - 2014 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2012 - 2016 GNU General Public License http://www.gnu.org/licenses
 *****************************************************************************/
 
 // check combination of header files
@@ -43,6 +43,9 @@
 
 #include "vectori128.h"
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 /*****************************************************************************
 *
@@ -325,7 +328,7 @@ public:
             *this = Vec32c(Vec16c().load_partial(n, p), 0);
         }
         else if (n < 32) {
-            *this = Vec32c(Vec16c().load(p), Vec16c().load_partial(n-16, (char*)p+16));
+            *this = Vec32c(Vec16c().load(p), Vec16c().load_partial(n-16, (char const*)p+16));
         }
         else {
             load(p);
@@ -1040,7 +1043,7 @@ public:
             *this = Vec16s(Vec8s().load_partial(n, p), 0);
         }
         else if (n < 16) {
-            *this = Vec16s(Vec8s().load(p), Vec8s().load_partial(n-8, (int16_t*)p+8));
+            *this = Vec16s(Vec8s().load(p), Vec8s().load_partial(n-8, (int16_t const*)p+8));
         }
         else {
             load(p);
@@ -1742,7 +1745,7 @@ public:
             *this = Vec8i(Vec4i().load_partial(n, p), 0);
         }
         else if (n < 8) {
-            *this = Vec8i(Vec4i().load(p), Vec4i().load_partial(n-4, (int32_t*)p+4));
+            *this = Vec8i(Vec4i().load(p), Vec4i().load_partial(n-4, (int32_t const*)p+4));
         }
         else {
             load(p);
@@ -2447,7 +2450,7 @@ public:
             *this = Vec4q(Vec2q().load_partial(n, p), 0);
         }
         else if (n < 4) {
-            *this = Vec4q(Vec2q().load(p), Vec2q().load_partial(n-2, (int64_t*)p+2));
+            *this = Vec4q(Vec2q().load(p), Vec2q().load_partial(n-2, (int64_t const*)p+2));
         }
         else {
             load(p);
@@ -4329,4 +4332,8 @@ static inline Vec4qb to_Vec4qb(uint8_t x) {
     return Vec4q(to_Vec2qb(x), to_Vec2qb(x>>2));
 }
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif // VECTORI256_H
diff --git a/vectorclass/vectori512.h b/vectorclass/vectori512.h
index dac51c3..156dfe7 100755
--- a/vectorclass/vectori512.h
+++ b/vectorclass/vectori512.h
@@ -1,8 +1,8 @@
 /****************************  vectori512.h   *******************************
 * Author:        Agner Fog
 * Date created:  2014-07-23
-* Last modified: 2014-10-16
-* Version:       1.16
+* Last modified: 2016-04-26
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file defining integer vector classes as interface to intrinsic 
@@ -25,7 +25,7 @@
 *
 * For detailed instructions, see VectorClass.pdf
 *
-* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
 *****************************************************************************/
 
 // check combination of header files
@@ -47,6 +47,9 @@
 
 #include "vectori256.h"
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 // Bug fix for missing intrinsics:
 // _mm512_cmpgt_epu32_mask, _mm512_cmpgt_epu64_mask
@@ -1360,10 +1363,22 @@ static inline Vec8q & operator -- (Vec8q & a) {
 
 // vector operator * : multiply element by element
 static inline Vec8q operator * (Vec8q const & a, Vec8q const & b) {
-#if defined (GCC_VERSION) && GCC_VERSION < 41100 && !defined(__INTEL_COMPILER) && !defined(__clang__)
-    return Vec8q(a.get_low() * b.get_low(), a.get_high() * b.get_high());  // _mm512_mullox_epi64 missing in gcc 4.10.
+#if defined (__INTEL_COMPILER)
+    return _mm512_mullox_epi64(a, b);                      // _mm512_mullox_epi64 missing in gcc
 #else
-    return _mm512_mullox_epi64(a, b);
+    // return Vec8q(a.get_low() * b.get_low(), a.get_high() * b.get_high());
+
+    // instruction does not exist. Split into 32-bit multiplies
+    //__m512i ahigh = _mm512_shuffle_epi32(a, 0xB1);       // swap H<->L
+    __m512i ahigh   = _mm512_srli_epi64(a, 32);            // high 32 bits of each a
+    __m512i bhigh   = _mm512_srli_epi64(b, 32);            // high 32 bits of each b
+    __m512i prodahb = _mm512_mul_epu32(ahigh, b);          // ahigh*b
+    __m512i prodbha = _mm512_mul_epu32(bhigh, a);          // bhigh*a
+    __m512i prodhl  = _mm512_add_epi64(prodahb, prodbha);  // sum of high*low products
+    __m512i prodhi  = _mm512_slli_epi64(prodhl, 32);       // same, shifted high
+    __m512i prodll  = _mm512_mul_epu32(a, b);              // alow*blow = 64 bit unsigned products
+    __m512i prod    = _mm512_add_epi64(prodll, prodhi);    // low*low+(high*low)<<32
+    return  prod;
 #endif
 }
 
@@ -2730,4 +2745,8 @@ static inline Vec8qb to_Vec8qb(uint8_t x) {
     return (__mmask8)x;
 }
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif // VECTORI512_H
diff --git a/vectorclass/vectori512e.h b/vectorclass/vectori512e.h
index de7dac6..1acacad 100755
--- a/vectorclass/vectori512e.h
+++ b/vectorclass/vectori512e.h
@@ -1,8 +1,8 @@
 /****************************  vectori512e.h   *******************************
 * Author:        Agner Fog
 * Date created:  2014-07-23
-* Last modified: 2014-10-16
-* Version:       1.16
+* Last modified: 2016-04-26
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file defining integer vector classes as interface to intrinsic 
@@ -25,7 +25,7 @@
 *
 * For detailed instructions, see VectorClass.pdf
 *
-* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2014 - 2016 GNU General Public License http://www.gnu.org/licenses
 *****************************************************************************/
 
 // check combination of header files
@@ -36,6 +36,9 @@
 #else
 #define VECTORI512_H  1
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 /*****************************************************************************
 *
@@ -87,13 +90,13 @@ public:
     // Member function to load from array (unaligned)
     Vec512b & load(void const * p) {
         z0 = Vec8i().load(p);
-        z1 = Vec8i().load((int32_t*)p+8);
+        z1 = Vec8i().load((int32_t const*)p+8);
         return *this;
     }
     // Member function to load from array, aligned by 64
     Vec512b & load_a(void const * p) {
         z0 = Vec8i().load_a(p);
-        z1 = Vec8i().load_a((int32_t*)p+8);
+        z1 = Vec8i().load_a((int32_t const*)p+8);
         return *this;
     }
     // Member function to store into array (unaligned)
@@ -729,7 +732,7 @@ public:
         }
         else {
             z0 = Vec8i().load(p);
-            z1 = Vec8i().load_partial(n - 8, (int32_t *)p + 8);
+            z1 = Vec8i().load_partial(n - 8, (int32_t const*)p + 8);
         }
         return *this;
     }
@@ -1250,13 +1253,13 @@ public:
     // Member function to load from array (unaligned)
     Vec8q & load(void const * p) {
         z0 = Vec4q().load(p);
-        z1 = Vec4q().load((int64_t*)p+4);
+        z1 = Vec4q().load((int64_t const*)p+4);
         return *this;
     }
     // Member function to load from array, aligned by 64
     Vec8q & load_a(void const * p) {
         z0 = Vec4q().load_a(p);
-        z1 = Vec4q().load_a((int64_t*)p+4);
+        z1 = Vec4q().load_a((int64_t const*)p+4);
         return *this;
     }
     // Partial load. Load n elements and set the rest to 0
@@ -1267,7 +1270,7 @@ public:
         }
         else {
             z0 = Vec4q().load(p);
-            z1 = Vec4q().load_partial(n - 4, (int64_t *)p + 4);
+            z1 = Vec4q().load_partial(n - 4, (int64_t const*)p + 4);
         }
         return *this;
     }
@@ -2542,4 +2545,8 @@ static inline Vec8qb to_Vec8qb(uint8_t x) {
     return Vec8q(to_Vec4qb(x), to_Vec4qb(x>>4));
 }
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif // VECTORI512_H
diff --git a/vectorclass/vectormath_common.h b/vectorclass/vectormath_common.h
index edcbd13..ee29469 100755
--- a/vectorclass/vectormath_common.h
+++ b/vectorclass/vectormath_common.h
@@ -1,8 +1,8 @@
 /***************************  vectormath_common.h   ****************************
 * Author:        Agner Fog
 * Date created:  2014-04-18
-* Last modified: 2014-10-16
-* Version:       1.16
+* Last modified: 2016-05-02
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file containing common code for inline version of mathematical functions.
@@ -21,7 +21,7 @@
 *
 * For detailed instructions, see VectorClass.pdf
 *
-* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
 ******************************************************************************/
 
 #ifndef VECTORMATH_COMMON_H
@@ -50,6 +50,9 @@
 #define VM_SMALLEST_NORMAL  2.2250738585072014E-308  // smallest normal number, double
 #define VM_SMALLEST_NORMALF 1.17549435E-38f          // smallest normal number, float
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 /******************************************************************************
       templates for producing infinite and nan in desired vector type
@@ -151,7 +154,7 @@ longest dependency chains first.
 ******************************************************************************/
 
 // template <typedef VECTYPE, typedef CTYPE> 
-template <class VTYPE, class CTYPE> 
+template <class VTYPE, class CTYPE>
 static inline VTYPE polynomial_2(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2) {
     // calculates polynomial c2*x^2 + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
@@ -160,86 +163,86 @@ static inline VTYPE polynomial_2(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2)
     return mul_add(x2, c2, mul_add(x, c1, c0));
 }
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_3(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) {
     // calculates polynomial c3*x^3 + c2*x^2 + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
     VTYPE x2 = x * x;
     //return (c2 + c3*x)*x2 + (c1*x + c0);
-    return mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0));
+    return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0));
 }
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_4(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) {
     // calculates polynomial c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
     VTYPE x2 = x * x;
     VTYPE x4 = x2 * x2;
     //return (c2+c3*x)*x2 + ((c0+c1*x) + c4*x4);
-    return mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0) + c4*x4);
+    return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c4*x4);
 }
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_4n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) {
     // calculates polynomial 1*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
     VTYPE x2 = x * x;
     VTYPE x4 = x2 * x2;
     //return (c2+c3*x)*x2 + ((c0+c1*x) + x4);
-    return mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0) + x4);
+    return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + x4);
 }
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_5(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) {
     // calculates polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
     VTYPE x2 = x * x;
     VTYPE x4 = x2 * x2;
     //return (c2+c3*x)*x2 + ((c4+c5*x)*x4 + (c0+c1*x));
-    return mul_add(mul_add(c3,x,c2), x2, mul_add(mul_add(c5,x,c4), x4, mul_add(c1,x,c0)));
+    return mul_add(mul_add(c3, x, c2), x2, mul_add(mul_add(c5, x, c4), x4, mul_add(c1, x, c0)));
 }
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_5n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) {
     // calculates polynomial 1*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
     VTYPE x2 = x * x;
     VTYPE x4 = x2 * x2;
     //return (c2+c3*x)*x2 + ((c4+x)*x4 + (c0+c1*x));
-    return mul_add( mul_add(c3,x,c2), x2, mul_add(c4+x,x4,mul_add(c1,x,c0)) );
+    return mul_add(mul_add(c3, x, c2), x2, mul_add(c4 + x, x4, mul_add(c1, x, c0)));
 }
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_6(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6) {
     // calculates polynomial c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
     VTYPE x2 = x * x;
     VTYPE x4 = x2 * x2;
     //return  (c4+c5*x+c6*x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
-    return mul_add(mul_add(c6,x2,mul_add(c5,x,c4)), x4, mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0)));
+    return mul_add(mul_add(c6, x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)));
 }
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_6n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) {
     // calculates polynomial 1*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
     VTYPE x2 = x * x;
     VTYPE x4 = x2 * x2;
     //return  (c4+c5*x+x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
-    return mul_add(mul_add(c5,x,c4+x2), x4, mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0)));
+    return mul_add(mul_add(c5, x, c4 + x2), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)));
 }
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_7(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7) {
     // calculates polynomial c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
     VTYPE x2 = x * x;
     VTYPE x4 = x2 * x2;
     //return  ((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
-    return mul_add(mul_add(mul_add(c7,x,c6), x2, mul_add(c5,x,c4)), x4, mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0)));
+    return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)));
 }
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_8(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8) {
     // calculates polynomial c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
@@ -247,11 +250,11 @@ static inline VTYPE polynomial_8(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2,
     VTYPE x4 = x2 * x2;
     VTYPE x8 = x4 * x4;
     //return  ((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8*x8 + (c2+c3*x)*x2 + (c0+c1*x));
-    return mul_add(mul_add(mul_add(c7,x,c6), x2, mul_add(c5,x,c4)), x4,
-           mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0)+c8*x8));
+    return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,
+        mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c8*x8));
 }
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_9(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9) {
     // calculates polynomial c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
@@ -259,12 +262,12 @@ static inline VTYPE polynomial_9(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2,
     VTYPE x4 = x2 * x2;
     VTYPE x8 = x4 * x4;
     //return  (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x)*x8) + ((c2+c3*x)*x2 + (c0+c1*x));
-    return mul_add(mul_add(c9,x,c8), x8, mul_add(
-        mul_add(mul_add(c7,x,c6), x2, mul_add(c5,x,c4)), x4,
-        mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0))));
+    return mul_add(mul_add(c9, x, c8), x8, mul_add(
+        mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,
+        mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))));
 }
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_10(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10) {
     // calculates polynomial c10*x^10 + c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
@@ -272,29 +275,29 @@ static inline VTYPE polynomial_10(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2,
     VTYPE x4 = x2 * x2;
     VTYPE x8 = x4 * x4;
     //return  (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x+c10*x2)*x8) + ((c2+c3*x)*x2 + (c0+c1*x));
-    return mul_add(mul_add(x2,c10,mul_add(c9,x,c8)), x8,
-                   mul_add(mul_add(mul_add(c7,x,c6),x2,mul_add(c5,x,c4)), x4,
-                           mul_add(mul_add(c3,x,c2),x2,mul_add(c1,x,c0))));
-} 
+    return mul_add(mul_add(x2, c10, mul_add(c9, x, c8)), x8,
+        mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,
+            mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))));
+}
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_13(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) {
     // calculates polynomial c13*x^13 + c12*x^12 + ... + c1*x + c0
     // VTYPE may be a vector type, CTYPE is a scalar type
     VTYPE x2 = x  * x;
     VTYPE x4 = x2 * x2;
     VTYPE x8 = x4 * x4;
-    return mul_add(        
-             mul_add(
-               mul_add(c13,x,c12), x4,
-                 mul_add(mul_add(c11,x,c10), x2, mul_add(c9,x,c8))), x8,
-             mul_add(
-               mul_add(mul_add(c7,x,c6), x2, mul_add(c5,x,c4)), x4,
-               mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0))));
+    return mul_add(
+        mul_add(
+            mul_add(c13, x, c12), x4,
+            mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8,
+        mul_add(
+            mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,
+            mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))));
 }
 
 
-template<class VTYPE, class CTYPE> 
+template<class VTYPE, class CTYPE>
 static inline VTYPE polynomial_13m(VTYPE const & x, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) {
     // calculates polynomial c13*x^13 + c12*x^12 + ... + x + 0
     // VTYPE may be a vector type, CTYPE is a scalar type
@@ -303,8 +306,12 @@ static inline VTYPE polynomial_13m(VTYPE const & x, CTYPE c2, CTYPE c3, CTYPE c4
     VTYPE x8 = x4 * x4;
     // return  ((c8+c9*x) + (c10+c11*x)*x2 + (c12+c13*x)*x4)*x8 + (((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + x));
     return mul_add(
-        mul_add(mul_add(c13,x,c12), x4, mul_add(mul_add(c11,x,c10), x2, mul_add(c9,x,c8))), x8,
-        mul_add( mul_add(mul_add(c7,x,c6), x2, mul_add(c5,x,c4)), x4, mul_add(mul_add(c3,x,c2),x2,x)));
+        mul_add(mul_add(c13, x, c12), x4, mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8,
+        mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, x)));
 }
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif
diff --git a/vectorclass/vectormath_exp.h b/vectorclass/vectormath_exp.h
index 465ada8..66a90ea 100755
--- a/vectorclass/vectormath_exp.h
+++ b/vectorclass/vectormath_exp.h
@@ -1,8 +1,8 @@
 /****************************  vectormath_exp.h   ******************************
 * Author:        Agner Fog
 * Date created:  2014-04-18
-* Last modified: 2015-02-10
-* Version:       1.16
+* Last modified: 2016-04-26
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file containing inline vector functions of logarithms, exponential 
@@ -25,7 +25,7 @@
 *
 * For detailed instructions, see vectormath_common.h and VectorClass.pdf
 *
-* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
 ******************************************************************************/
 
 #ifndef VECTORMATH_EXP_H
@@ -33,6 +33,9 @@
 
 #include "vectormath_common.h"  
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 /******************************************************************************
 *                 Exponential functions
@@ -1329,7 +1332,7 @@ static inline VTYPE pow_template_d(VTYPE const & x0, VTYPE const & y) {
     z = select(xfinite, z, select(y == 0., VTYPE(1.), select(y < 0., VTYPE(0.), infinite_vec<VTYPE>() | ( VTYPE(reinterpret_d(yodd)) & x0))));
     z = select(is_nan(x0), select(is_nan(y), x0 | y, x0), select(is_nan(y), y, z));
     return z;
-}; 
+}
 
 
 //This template is in vectorf128.h to prevent implicit conversion of float y to int when float version is not defined:
@@ -1615,7 +1618,7 @@ public:
         // but we can check a even anyway at no cost to be sure)
         if (a == 0) return 1.f;
         if ((b | ~a) & 1) y = abs(y);
-        y = ::pow(y, float(double(a)/double(b)));
+        y = pow(y, float(double(a)/double(b)));
         if (a & b & 1) y = sign_combine(y, x);          // apply sign if a and b both odd
         if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y); // zero allowed for positive a and b
         return y;
@@ -1624,7 +1627,7 @@ public:
         Vec2d y = x;
         if (a == 0) return 1.;
         if ((b | ~a) & 1) y = abs(y);
-        y = ::pow(y, double((long double)a/(long double)b));
+        y = pow(y, double((long double)a/(long double)b));
         if (a & b & 1) y = sign_combine(y, x);
         if ((a ^ b) >= 0) y = select(x == 0., 0., y);
         return y;
@@ -1634,7 +1637,7 @@ public:
         Vec8f y = x;
         if (a == 0) return 1.f;
         if ((b | ~a) & 1) y = abs(y);
-        y = ::pow(y, float(double(a)/double(b)));
+        y = pow(y, float(double(a)/double(b)));
         if (a & b & 1) y = sign_combine(y, x);
         if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y);
         return y;
@@ -1643,7 +1646,7 @@ public:
         Vec4d y = x;
         if (a == 0) return 1.;
         if ((b | ~a) & 1) y = abs(y);
-        y = ::pow(y, double((long double)a/(long double)b));
+        y = pow(y, double((long double)a/(long double)b));
         if (a & b & 1) y = sign_combine(y, x);
         if ((a ^ b) >= 0) y = select(x == 0., 0., y);
         return y;
@@ -1654,7 +1657,7 @@ public:
         Vec16f y = x;
         if (a == 0) return 1.f;
         if ((b | ~a) & 1) y = abs(y);
-        y = ::pow(y, float(double(a)/double(b)));
+        y = pow(y, float(double(a)/double(b)));
         if (a & b & 1) y = sign_combine(y, x);
         if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y);
         return y;
@@ -1663,7 +1666,7 @@ public:
         Vec8d y = x;
         if (a == 0) return 1.;
         if ((b | ~a) & 1) y = abs(y);
-        y = ::pow(y, double((long double)a/(long double)b));
+        y = pow(y, double((long double)a/(long double)b));
         if (a & b & 1) y = sign_combine(y, x);
         if ((a ^ b) >= 0) y = select(x == 0., 0., y);
         return y;
@@ -1992,4 +1995,8 @@ static inline Vec8q nan_code(Vec8d const & x) {
 
 #endif // MAX_VECTOR_SIZE >= 512
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif  // VECTORMATH_EXP_H
diff --git a/vectorclass/vectormath_hyp.h b/vectorclass/vectormath_hyp.h
index 948269b..adc8306 100755
--- a/vectorclass/vectormath_hyp.h
+++ b/vectorclass/vectormath_hyp.h
@@ -24,7 +24,7 @@
 *
 * For detailed instructions, see vectormath_common.h and VectorClass.pdf
 *
-* (c) Copyright 2015 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
 ******************************************************************************/
 
 #ifndef VECTORMATH_HYP_H
@@ -32,6 +32,9 @@
 
 #include "vectormath_exp.h"  
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
 
 /******************************************************************************
 *                 Hyperbolic functions
@@ -733,4 +736,8 @@ static inline Vec16f atanh(Vec16f const & x) {
 }
 #endif // MAX_VECTOR_SIZE >= 512
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif
diff --git a/vectorclass/vectormath_lib.h b/vectorclass/vectormath_lib.h
index edea799..675769b 100755
--- a/vectorclass/vectormath_lib.h
+++ b/vectorclass/vectormath_lib.h
@@ -1,23 +1,23 @@
 /****************************  vectormath_lib.h   *****************************
-| Author:        Agner Fog
-| Date created:  2012-05-30
-* Last modified: 2014-04-23
-| Version:       1.16
-| Project:       vector classes
-| Description:
-| Header file defining mathematical functions on floating point vectors
-| May use Intel SVML library or AMD LIBM library
-|
-| Instructions:
-| Define VECTORMATH to one of the following values:
-|   0:  Use ordinary math library (slow)
-|   1:  Use AMD LIBM library
-|   2:  Use Intel SVML library with any compiler
-|   3:  Use Intel SVML library with Intel compiler
-|
-| For detailed instructions, see VectorClass.pdf
-|
-| (c) Copyright 2012-2014 GNU General Public License http://www.gnu.org/licenses
+* Author:        Agner Fog
+* Date created:  2012-05-30
+* Last modified: 2016-04-26
+* Version:       1.22
+* Project:       vector classes
+* Description:
+* Header file defining mathematical functions on floating point vectors
+* May use Intel SVML library or AMD LIBM library
+*
+* Instructions:
+* Define VECTORMATH to one of the following values:
+*   0:  Use ordinary math library (slow)
+*   1:  Use AMD LIBM library
+*   2:  Use Intel SVML library with any compiler
+*   3:  Use Intel SVML library with Intel compiler
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2012-2016 GNU General Public License http://www.gnu.org/licenses
 \*****************************************************************************/
 
 // check combination of header files
@@ -34,13 +34,18 @@
 #endif // __INTEL_COMPILER
 #endif // VECTORMATH
 
+#include <math.h>
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+
 /*****************************************************************************
 *
 *      VECTORMATH = 0. Use ordinary library (scalar)
 *
 *****************************************************************************/
 #if VECTORMATH == 0
-#include <math.h>
 
 #ifndef VECTORMATH_COMMON_H
 // exponential and power functions
@@ -52,7 +57,7 @@ static inline Vec4f exp (Vec4f const & x) {
 static inline Vec2d exp (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(exp(xx[0]), exp(xx[1]));
+    return Vec2d(::exp(xx[0]), ::exp(xx[1]));
 }
 
 // There is no certain way to know which functions are available, but at least some (Gnu)
@@ -105,30 +110,30 @@ static inline Vec4f pow (Vec4f const & a, Vec4f const & b) {
 static inline Vec2d pow (Vec2d const & a, Vec2d const & b) {
     double aa[4], bb[4];
     a.store(aa);  b.store(bb);
-    return Vec2d(pow(aa[0],bb[0]), pow(aa[1],bb[1]));
+    return Vec2d(::pow(aa[0],bb[0]), ::pow(aa[1],bb[1]));
 }
 
 static inline Vec4f log (Vec4f const & x) {
     float xx[4];
     x.store(xx);
-    return Vec4f(log(xx[0]), log(xx[1]), log(xx[2]), log(xx[3]));
+    return Vec4f(logf(xx[0]), logf(xx[1]), logf(xx[2]), logf(xx[3]));
 }
 static inline Vec2d log (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(log(xx[0]), log(xx[1]));
+    return Vec2d(::log(xx[0]), ::log(xx[1]));
 }
 
 #ifdef HAVE_LOG1P
 static inline Vec4f log1p (Vec4f const & x) {
     float xx[4];
     x.store(xx);
-    return Vec4f(log1p(xx[0]), log1p(xx[1]), log1p(xx[2]), log1p(xx[3]));
+    return Vec4f(::log1p(xx[0]), ::log1p(xx[1]), ::log1p(xx[2]), ::log1p(xx[3]));
 }
 static inline Vec2d log1p (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(log1p(xx[0]), log1p(xx[1]));
+    return Vec2d(::log1p(xx[0]), ::log1p(xx[1]));
 }
 #endif
 
@@ -147,7 +152,7 @@ static inline Vec4f log10 (Vec4f const & x) {  // logarithm base 10
 static inline Vec2d log10 (Vec2d const & x) {  // logarithm base 10
     double xx[4];
     x.store(xx);
-    return Vec2d(log10(xx[0]), log10(xx[1]));
+    return Vec2d(::log10(xx[0]), ::log10(xx[1]));
 }
 
 // trigonometric functions
@@ -159,7 +164,7 @@ static inline Vec4f sin(Vec4f const & x) {
 static inline Vec2d sin (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(sin(xx[0]), sin(xx[1]));
+    return Vec2d(::sin(xx[0]), ::sin(xx[1]));
 }
 
 static inline Vec4f cos(Vec4f const & x) {
@@ -170,7 +175,7 @@ static inline Vec4f cos(Vec4f const & x) {
 static inline Vec2d cos (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(cos(xx[0]), cos(xx[1]));
+    return Vec2d(::cos(xx[0]), ::cos(xx[1]));
 }
 
 static inline Vec4f sincos (Vec4f * pcos, Vec4f const & x) {   // sine and cosine. sin(x) returned, cos(x) in pcos
@@ -190,7 +195,7 @@ static inline Vec4f tan(Vec4f const & x) {
 static inline Vec2d tan (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(tan(xx[0]), tan(xx[1]));
+    return Vec2d(::tan(xx[0]), ::tan(xx[1]));
 }
 
 // inverse trigonometric functions
@@ -202,7 +207,7 @@ static inline Vec4f asin(Vec4f const & x) {
 static inline Vec2d asin (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(asin(xx[0]), asin(xx[1]));
+    return Vec2d(::asin(xx[0]), ::asin(xx[1]));
 }
 
 static inline Vec4f acos(Vec4f const & x) {
@@ -213,7 +218,7 @@ static inline Vec4f acos(Vec4f const & x) {
 static inline Vec2d acos (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(acos(xx[0]), acos(xx[1]));
+    return Vec2d(::acos(xx[0]), ::acos(xx[1]));
 }
 
 static inline Vec4f atan(Vec4f const & x) {
@@ -224,7 +229,7 @@ static inline Vec4f atan(Vec4f const & x) {
 static inline Vec2d atan (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(atan(xx[0]), atan(xx[1]));
+    return Vec2d(::atan(xx[0]), ::atan(xx[1]));
 }
 
 static inline Vec4f atan2 (Vec4f const & a, Vec4f const & b) {   // inverse tangent of a/b
@@ -235,7 +240,7 @@ static inline Vec4f atan2 (Vec4f const & a, Vec4f const & b) {   // inverse tang
 static inline Vec2d atan2 (Vec2d const & a, Vec2d const & b) {   // inverse tangent of a/b
     double aa[4], bb[4];
     a.store(aa);  b.store(bb);
-    return Vec2d(atan2(aa[0],bb[0]), atan2(aa[1],bb[1]));
+    return Vec2d(::atan2(aa[0],bb[0]), ::atan2(aa[1],bb[1]));
 }
 #endif // VECTORMATH_COMMON_H
 
@@ -248,7 +253,7 @@ static inline Vec4f sinh(Vec4f const & x) {   // hyperbolic sine
 static inline Vec2d sinh (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(sinh(xx[0]), sinh(xx[1]));
+    return Vec2d(::sinh(xx[0]), ::sinh(xx[1]));
 }
 
 static inline Vec4f cosh(Vec4f const & x) {   // hyperbolic cosine
@@ -259,7 +264,7 @@ static inline Vec4f cosh(Vec4f const & x) {   // hyperbolic cosine
 static inline Vec2d cosh (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(cosh(xx[0]), cosh(xx[1]));
+    return Vec2d(::cosh(xx[0]), ::cosh(xx[1]));
 }
 
 static inline Vec4f tanh(Vec4f const & x) {   // hyperbolic tangent
@@ -270,7 +275,7 @@ static inline Vec4f tanh(Vec4f const & x) {   // hyperbolic tangent
 static inline Vec2d tanh (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(tanh(xx[0]), tanh(xx[1]));
+    return Vec2d(::tanh(xx[0]), ::tanh(xx[1]));
 }
 
 // error function
@@ -278,12 +283,12 @@ static inline Vec2d tanh (Vec2d const & x) {
 static inline Vec4f erf(Vec4f const & x) {
     float xx[4];
     x.store(xx);
-    return Vec4f(erf(xx[0]), erf(xx[1]), erf(xx[2]), erf(xx[3]));
+    return Vec4f(::erf(xx[0]), ::erf(xx[1]), ::erf(xx[2]), ::erf(xx[3]));
 }
 static inline Vec2d erf (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(erf(xx[0]), erf(xx[1]));
+    return Vec2d(::erf(xx[0]), ::erf(xx[1]));
 }
 #endif
 
@@ -291,12 +296,12 @@ static inline Vec2d erf (Vec2d const & x) {
 static inline Vec4f erfc(Vec4f const & x) {
     float xx[4];
     x.store(xx);
-    return Vec4f(erfc(xx[0]), erfc(xx[1]), erfc(xx[2]), erfc(xx[3]));
+    return Vec4f(::erfc(xx[0]), ::erfc(xx[1]), ::erfc(xx[2]), ::erfc(xx[3]));
 }
 static inline Vec2d erfc (Vec2d const & x) {
     double xx[4];
     x.store(xx);
-    return Vec2d(erfc(xx[0]), erfc(xx[1]));
+    return Vec2d(::erfc(xx[0]), ::erfc(xx[1]));
 }
 #endif
 
@@ -312,8 +317,8 @@ static inline Vec4f cexp (Vec4f const & x) {   // complex exponential function
 static inline Vec2d cexp (Vec2d const & x) {   // complex exponential function
     double xx[2];
     x.store(xx);
-    Vec2d z(cos(xx[1]), sin(xx[1]));
-    return z * exp(xx[0]);
+    Vec2d z(::cos(xx[1]), ::sin(xx[1]));
+    return z * ::exp(xx[0]);
 }
 
 #if defined (VECTORF256_H)  // 256 bit vectors defined
@@ -440,11 +445,11 @@ static inline Vec4d atan (Vec4d const & x) {   // inverse tangent
     return Vec4d(atan(x.get_low()), atan(x.get_high()));
 }
 
-static inline Vec8f atan (Vec8f const & a, Vec8f const & b) {   // inverse tangent of a/b
-    return Vec8f(atan(a.get_low(),b.get_low()), atan(a.get_high(),b.get_high()));
+static inline Vec8f atan2 (Vec8f const & a, Vec8f const & b) {   // inverse tangent of a/b
+    return Vec8f(atan2(a.get_low(),b.get_low()), atan2(a.get_high(),b.get_high()));
 }
-static inline Vec4d atan (Vec4d const & a, Vec4d const & b) {   // inverse tangent of a/b
-    return Vec4d(atan(a.get_low(),b.get_low()), atan(a.get_high(),b.get_high()));
+static inline Vec4d atan2 (Vec4d const & a, Vec4d const & b) {   // inverse tangent of a/b
+    return Vec4d(atan2(a.get_low(),b.get_low()), atan2(a.get_high(),b.get_high()));
 }
 #endif // VECTORMATH_COMMON_H
 
@@ -2104,4 +2109,8 @@ static inline Vec4d cexp (Vec4d const & x) {   // complex exponential function
 
 #endif // VECTORF256_H == 1
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif // VECTORMATH_LIB_H
diff --git a/vectorclass/vectormath_trig.h b/vectorclass/vectormath_trig.h
index 986d2e4..11ab7e3 100755
--- a/vectorclass/vectormath_trig.h
+++ b/vectorclass/vectormath_trig.h
@@ -1,8 +1,8 @@
 /****************************  vectormath_trig.h   ******************************
 * Author:        Agner Fog
 * Date created:  2014-04-18
-* Last modified: 2015-02-10
-* Version:       1.16
+* Last modified: 2016-05-02
+* Version:       1.22
 * Project:       vector classes
 * Description:
 * Header file containing inline version of trigonometric functions 
@@ -20,7 +20,7 @@
 *
 * For detailed instructions, see vectormath_common.h and VectorClass.pdf
 *
-* (c) Copyright 2015 GNU General Public License http://www.gnu.org/licenses
+* (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
 ******************************************************************************/
 
 #ifndef VECTORMATH_TRIG_H
@@ -28,6 +28,10 @@
 
 #include "vectormath_common.h"
 
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+
 // Different overloaded functions for template resolution.
 // These are used to fix the problem that the quadrant index uses
 // a vector of 32-bit integers which doesn't fit the size of the
@@ -40,7 +44,7 @@
 
 // define overloaded truncate functions
 static inline Vec4i vm_truncate_low_to_int(Vec2d const & x) {
-    return truncate_to_int(x,x);
+    return truncate_to_int(x, x);
 }
 
 #if MAX_VECTOR_SIZE >= 256
@@ -85,21 +89,21 @@ template<class ITYPE, class ITYPEH>
 static inline ITYPE vm_half_int_vector_to_full(ITYPEH const & x);
 
 template<>
-inline Vec2q vm_half_int_vector_to_full<Vec2q,Vec4i>(Vec4i const & x) {
+inline Vec2q vm_half_int_vector_to_full<Vec2q, Vec4i>(Vec4i const & x) {
     return extend_low(x);
 }
 
 #if MAX_VECTOR_SIZE >= 256
 template<>
-inline Vec4q vm_half_int_vector_to_full<Vec4q,Vec4i>(Vec4i const & x) {
-    return extend_low(Vec8i(x,x));
+inline Vec4q vm_half_int_vector_to_full<Vec4q, Vec4i>(Vec4i const & x) {
+    return extend_low(Vec8i(x, x));
 }
 #endif // MAX_VECTOR_SIZE >= 256
 
 #if MAX_VECTOR_SIZE >= 512
 template<>
-inline Vec8q vm_half_int_vector_to_full<Vec8q,Vec8i>(Vec8i const & x) {
-    return extend_low(Vec16i(x,x));
+inline Vec8q vm_half_int_vector_to_full<Vec8q, Vec8i>(Vec8i const & x) {
+    return extend_low(Vec16i(x, x));
 }
 #endif // MAX_VECTOR_SIZE >= 512
 
@@ -117,25 +121,25 @@ inline Vec8q vm_half_int_vector_to_full<Vec8q,Vec8i>(Vec8i const & x) {
 // Paramterers:
 // xx = input x (radians)
 // cosret = return pointer (only if SC = 3)
-template<class VTYPE, class ITYPE, class ITYPEH, class BVTYPE, int SC> 
+template<class VTYPE, class ITYPE, class ITYPEH, class BVTYPE, int SC>
 static inline VTYPE sincos_d(VTYPE * cosret, VTYPE const & xx) {
 
     // define constants
-    const double ONEOPIO4 = 4./VM_PI;
+    const double ONEOPIO4 = 4. / VM_PI;
 
-    const double P0sin =-1.66666666666666307295E-1;
+    const double P0sin = -1.66666666666666307295E-1;
     const double P1sin = 8.33333333332211858878E-3;
-    const double P2sin =-1.98412698295895385996E-4;
+    const double P2sin = -1.98412698295895385996E-4;
     const double P3sin = 2.75573136213857245213E-6;
-    const double P4sin =-2.50507477628578072866E-8;
+    const double P4sin = -2.50507477628578072866E-8;
     const double P5sin = 1.58962301576546568060E-10;
 
     const double P0cos = 4.16666666666665929218E-2;
-    const double P1cos =-1.38888888888730564116E-3;
+    const double P1cos = -1.38888888888730564116E-3;
     const double P2cos = 2.48015872888517045348E-5;
-    const double P3cos =-2.75573141792967388112E-7;
+    const double P3cos = -2.75573141792967388112E-7;
     const double P4cos = 2.08757008419747316778E-9;
-    const double P5cos =-1.13585365213876817300E-11;
+    const double P5cos = -1.13585365213876817300E-11;
 
     const double DP1 = 7.853981554508209228515625E-1;
     const double DP2 = 7.94662735614792836714E-9;
@@ -176,7 +180,7 @@ static inline VTYPE sincos_d(VTYPE * cosret, VTYPE const & xx) {
     c = mul_add(x2 * x2, c, nmul_add(x2, 0.5, 1.0));                 // c = 1.0 - x2 * 0.5 + (x2 * x2) * c;
 
     // correct for quadrant
-    qq = vm_half_int_vector_to_full<ITYPE,ITYPEH>(q);
+    qq = vm_half_int_vector_to_full<ITYPE, ITYPEH>(q);
     swap = BVTYPE((qq & 2) != 0);
 
     // check for overflow
@@ -256,23 +260,23 @@ static inline Vec8d sincos(Vec8d * cosret, Vec8d const & x) {
 // Paramterers:
 // xx = input x (radians)
 // cosret = return pointer (only if SC = 3)
-template<class VTYPE, class ITYPE, class BVTYPE, int SC> 
+template<class VTYPE, class ITYPE, class BVTYPE, int SC>
 static inline VTYPE sincos_f(VTYPE * cosret, VTYPE const & xx) {
 
     // define constants
-    const float ONEOPIO4f = (float)(4./VM_PI);
+    const float ONEOPIO4f = (float)(4. / VM_PI);
 
     const float DP1F = 0.78515625f;
     const float DP2F = 2.4187564849853515625E-4f;
-    const float DP3F = 3.77489497744594108E-8f; 
+    const float DP3F = 3.77489497744594108E-8f;
 
     const float P0sinf = -1.6666654611E-1f;
-    const float P1sinf =  8.3321608736E-3f;
+    const float P1sinf = 8.3321608736E-3f;
     const float P2sinf = -1.9515295891E-4f;
 
-    const float P0cosf =  4.166664568298827E-2f;
+    const float P0cosf = 4.166664568298827E-2f;
     const float P1cosf = -1.388731625493765E-3f;
-    const float P2cosf =  2.443315711809948E-5f;
+    const float P2cosf = 2.443315711809948E-5f;
 
     VTYPE  xa, x, y, x2, s, c, sin1, cos1;  // data vectors
     ITYPE  q, signsin, signcos;             // integer vectors
@@ -299,7 +303,7 @@ static inline VTYPE sincos_f(VTYPE * cosret, VTYPE const & xx) {
 
     // Taylor expansion of sin and cos, valid for -pi/4 <= x <= pi/4
     x2 = x * x;
-    s = polynomial_2(x2, P0sinf, P1sinf, P2sinf) * (x*x2)  + x;
+    s = polynomial_2(x2, P0sinf, P1sinf, P2sinf) * (x*x2) + x;
     c = polynomial_2(x2, P0cosf, P1cosf, P2cosf) * (x2*x2) + nmul_add(0.5f, x2, 1.0f);
 
     // correct for quadrant
@@ -322,7 +326,7 @@ static inline VTYPE sincos_f(VTYPE * cosret, VTYPE const & xx) {
         signcos = ((q + 2) << 29) & (1 << 31);
         cos1 ^= reinterpret_f(signcos);
     }
-    if      (SC == 1) return sin1;
+    if (SC == 1) return sin1;
     else if (SC == 2) return cos1;
     else if (SC == 3) {  // calculate both. cos returned through pointer
         *cosret = cos1;
@@ -396,19 +400,19 @@ static inline Vec16f tan(Vec16f const & x) {
 // BVTYPE: boolean vector type
 // Paramterers:
 // x = input x (radians)
-template<class VTYPE, class ITYPE, class ITYPEH, class BVTYPE> 
+template<class VTYPE, class ITYPE, class ITYPEH, class BVTYPE>
 static inline VTYPE tan_d(VTYPE const & x) {
 
     // define constants
-    const double ONEOPIO4 = 4./VM_PI;
+    const double ONEOPIO4 = 4. / VM_PI;
 
     const double DP1 = 7.853981554508209228515625E-1;
     const double DP2 = 7.94662735614792836714E-9;
     const double DP3 = 3.06161699786838294307E-17;
 
-    const double P2tan=-1.30936939181383777646E4;
-    const double P1tan=1.15351664838587416140E6;
-    const double P0tan=-1.79565251976484877988E7;
+    const double P2tan = -1.30936939181383777646E4;
+    const double P1tan = 1.15351664838587416140E6;
+    const double P0tan = -1.79565251976484877988E7;
 
     const double Q3tan = 1.36812963470692954678E4;
     const double Q2tan = -1.32089234440210967447E6;
@@ -439,14 +443,14 @@ static inline VTYPE tan_d(VTYPE const & x) {
 
     // Pade expansion of tan, valid for -pi/4 <= x <= pi/4
     zz = z * z;
-    px = polynomial_2 (zz, P0tan, P1tan, P2tan);
+    px = polynomial_2(zz, P0tan, P1tan, P2tan);
     qx = polynomial_4n(zz, Q0tan, Q1tan, Q2tan, Q3tan);
 
     // qx cannot be 0 for x <= pi/4
     tn = mul_add(px / qx, z * zz, z);            // tn = z + z * zz * px / qx;
 
     // if (q&2) tn = -1/tn
-    qq = vm_half_int_vector_to_full<ITYPE,ITYPEH>(q);
+    qq = vm_half_int_vector_to_full<ITYPE, ITYPEH>(q);
     doinvert = BVTYPE((qq & 2) != 0);
     xzero = (xa == 0.);
     // avoid division by 0. We will not be using recip anyway if xa == 0.
@@ -472,20 +476,20 @@ static inline Vec2d tan(Vec2d const & x) {
 }
 
 #if MAX_VECTOR_SIZE >= 256
-static inline Vec4d tan(Vec4d const & x) { 
+static inline Vec4d tan(Vec4d const & x) {
     return tan_d<Vec4d, Vec4q, Vec4i, Vec4db>(x);
 }
 #endif // MAX_VECTOR_SIZE >= 256
 
 #if MAX_VECTOR_SIZE >= 512
-static inline Vec8d tan(Vec8d const & x) { 
+static inline Vec8d tan(Vec8d const & x) {
     return tan_d<Vec8d, Vec8q, Vec8i, Vec8db>(x);
 }
 #endif // MAX_VECTOR_SIZE >= 512
 
 
 /*
-This is removed for the single precision version. 
+This is removed for the single precision version.
 It is faster to use tan(x) = sin(x)/cos(x)
 
 // *************************************************************
@@ -498,7 +502,7 @@ It is faster to use tan(x) = sin(x)/cos(x)
 // Paramterers:
 // x = input x (radians)
 // cosret = return pointer (only if SC = 3)
-template<class VTYPE, class ITYPE, class BVTYPE> 
+template<class VTYPE, class ITYPE, class BVTYPE>
 static inline VTYPE tan_f(VTYPE const & x) {
 
     // define constants
@@ -544,7 +548,7 @@ static inline VTYPE tan_f(VTYPE const & x) {
     doinvert = (q & 2) != 0;
     xzero = (xa == 0.f);
     // avoid division by 0. We will not be using recip anyway if xa == 0.
-    // tn never becomes exactly 0 when x = pi/2 so we only have to make 
+    // tn never becomes exactly 0 when x = pi/2 so we only have to make
     // a special case for x == 0.
     recip = (-1.f) / select(xzero, VTYPE(-1.f), tn);
     tn = select(doinvert, recip, tn);
@@ -557,7 +561,7 @@ static inline VTYPE tan_f(VTYPE const & x) {
 
 static inline Vec4f tan(Vec4f const & x) {
     return tan_f<Vec4f, Vec4i, Vec4fb>(x);
-} 
+}
 
 static inline Vec8f tan(Vec8f const & x) {
     return tan_f<Vec8f, Vec8i, Vec8fb>(x);
@@ -573,39 +577,39 @@ static inline Vec8f tan(Vec8f const & x) {
 // AC: 0 = asin, 1 = acos
 // Paramterers:
 // x = input x
-template<class VTYPE, class BVTYPE, int AC> 
+template<class VTYPE, class BVTYPE, int AC>
 static inline VTYPE asin_d(VTYPE const & x) {
 
     // define constants
-    const double R4asin =  2.967721961301243206100E-3;
+    const double R4asin = 2.967721961301243206100E-3;
     const double R3asin = -5.634242780008963776856E-1;
-    const double R2asin =  6.968710824104713396794E0;
+    const double R2asin = 6.968710824104713396794E0;
     const double R1asin = -2.556901049652824852289E1;
-    const double R0asin =  2.853665548261061424989E1;
+    const double R0asin = 2.853665548261061424989E1;
 
     const double S3asin = -2.194779531642920639778E1;
-    const double S2asin =  1.470656354026814941758E2;
+    const double S2asin = 1.470656354026814941758E2;
     const double S1asin = -3.838770957603691357202E2;
-    const double S0asin =  3.424398657913078477438E2;
+    const double S0asin = 3.424398657913078477438E2;
 
-    const double P5asin =  4.253011369004428248960E-3;
+    const double P5asin = 4.253011369004428248960E-3;
     const double P4asin = -6.019598008014123785661E-1;
-    const double P3asin =  5.444622390564711410273E0;
+    const double P3asin = 5.444622390564711410273E0;
     const double P2asin = -1.626247967210700244449E1;
-    const double P1asin =  1.956261983317594739197E1;
+    const double P1asin = 1.956261983317594739197E1;
     const double P0asin = -8.198089802484824371615E0;
 
     const double Q4asin = -1.474091372988853791896E1;
-    const double Q3asin =  7.049610280856842141659E1;
+    const double Q3asin = 7.049610280856842141659E1;
     const double Q2asin = -1.471791292232726029859E2;
-    const double Q1asin =  1.395105614657485689735E2;
+    const double Q1asin = 1.395105614657485689735E2;
     const double Q0asin = -4.918853881490881290097E1;
 
     VTYPE  xa, xb, x1, x2, x3, x4, x5, px, qx, rx, sx, vx, wx, y1, yb, z, z1, z2;
     BVTYPE big;
     bool   dobig, dosmall;
 
-    xa  = abs(x);
+    xa = abs(x);
     big = xa >= 0.625;
 
     /*
@@ -614,7 +618,7 @@ static inline VTYPE asin_d(VTYPE const & x) {
     x = xa * xa;
     px = PX(x);
     qx = QX(x);
-    y1 = x*px/qx;    
+    y1 = x*px/qx;
     y1 = xa * y1 + xa;
 
     Big: xa >= 0.625
@@ -630,7 +634,7 @@ static inline VTYPE asin_d(VTYPE const & x) {
 
     // select a common x for all polynomials
     // This allows sharing of powers of x through common subexpression elimination
-    x1 = select(big, 1.0 - xa, xa * xa); 
+    x1 = select(big, 1.0 - xa, xa * xa);
 
     // calculate powers of x1 outside branches to make sure they are only calculated once
     x2 = x1 * x1;
@@ -639,30 +643,30 @@ static inline VTYPE asin_d(VTYPE const & x) {
     x3 = x2 * x1;
 
     dosmall = !horizontal_and(big);   // at least one element is small
-    dobig   =  horizontal_or(big) ;   // at least one element is big
+    dobig = horizontal_or(big);   // at least one element is big
 
     // calculate polynomials (reuse powers of x)
     if (dosmall) {
         // px = polynomial_5 (x1, P0asin, P1asin, P2asin, P3asin, P4asin, P5asin);
         // qx = polynomial_5n(x1, Q0asin, Q1asin, Q2asin, Q3asin, Q4asin);
-        px = mul_add(x3,P3asin,P0asin) + mul_add(x4,P4asin,x1*P1asin) + mul_add(x5,P5asin,x2*P2asin);
-        qx = mul_add(x4,Q4asin,x5) + mul_add(x3,Q3asin,x1*Q1asin) + mul_add(x2,Q2asin,Q0asin);
+        px = mul_add(x3, P3asin, P0asin) + mul_add(x4, P4asin, x1*P1asin) + mul_add(x5, P5asin, x2*P2asin);
+        qx = mul_add(x4, Q4asin, x5) + mul_add(x3, Q3asin, x1*Q1asin) + mul_add(x2, Q2asin, Q0asin);
     }
     if (dobig) {
         // rx = polynomial_4 (x1, R0asin, R1asin, R2asin, R3asin, R4asin);
         // sx = polynomial_4n(x1, S0asin, S1asin, S2asin, S3asin);
-        rx = mul_add(x3,R3asin,x2*R2asin) + mul_add(x4,R4asin,mul_add(x1,R1asin,R0asin));
-        sx = mul_add(x3,S3asin,x4) + mul_add(x2,S2asin,mul_add(x1,S1asin,S0asin));
+        rx = mul_add(x3, R3asin, x2*R2asin) + mul_add(x4, R4asin, mul_add(x1, R1asin, R0asin));
+        sx = mul_add(x3, S3asin, x4) + mul_add(x2, S2asin, mul_add(x1, S1asin, S0asin));
     }
 
     // select and divide outside branches to avoid dividing twice
     vx = select(big, rx, px);
     wx = select(big, sx, qx);
-    y1 = vx/wx * x1;
+    y1 = vx / wx * x1;
 
     // results for big
     if (dobig) {                                 // avoid square root if all are small
-        xb = sqrt(x1+x1);                        // this produces NAN if xa > 1 so we don't need a special case for xa > 1
+        xb = sqrt(x1 + x1);                        // this produces NAN if xa > 1 so we don't need a special case for xa > 1
         z1 = mul_add(xb, y1, xb);                // yb = xb * y1; z1 = xb + yb;
     }
 
@@ -694,21 +698,21 @@ static inline Vec2d acos(Vec2d const & x) {
 }
 
 #if MAX_VECTOR_SIZE >= 256
-static inline Vec4d asin(Vec4d const & x) { 
+static inline Vec4d asin(Vec4d const & x) {
     return asin_d<Vec4d, Vec4db, 0>(x);
 }
 
-static inline Vec4d acos(Vec4d const & x) { 
+static inline Vec4d acos(Vec4d const & x) {
     return asin_d<Vec4d, Vec4db, 1>(x);
 }
 #endif // MAX_VECTOR_SIZE >= 256
 
 #if MAX_VECTOR_SIZE >= 512
-static inline Vec8d asin(Vec8d const & x) { 
+static inline Vec8d asin(Vec8d const & x) {
     return asin_d<Vec8d, Vec8db, 0>(x);
 }
 
-static inline Vec8d acos(Vec8d const & x) { 
+static inline Vec8d acos(Vec8d const & x) {
     return asin_d<Vec8d, Vec8db, 1>(x);
 }
 #endif // MAX_VECTOR_SIZE >= 512
@@ -723,7 +727,7 @@ static inline Vec8d acos(Vec8d const & x) {
 // AC: 0 = asin, 1 = acos
 // Paramterers:
 // x = input x
-template<class VTYPE, class BVTYPE, int AC> 
+template<class VTYPE, class BVTYPE, int AC>
 static inline VTYPE asin_f(VTYPE const & x) {
 
     // define constants
@@ -736,11 +740,11 @@ static inline VTYPE asin_f(VTYPE const & x) {
     VTYPE  xa, x1, x2, x3, x4, xb, z, z1, z2;
     BVTYPE big;
 
-    xa  = abs(x);
+    xa = abs(x);
     big = xa > 0.5f;
 
     x1 = 0.5f * (1.0f - xa);
-    x2 = xa * xa;        
+    x2 = xa * xa;
     x3 = select(big, x1, x2);
 
     //if (horizontal_or(big)) 
@@ -757,12 +761,12 @@ static inline VTYPE asin_f(VTYPE const & x) {
     if (AC) {  // acos
         z1 = select(x < 0., float(VM_PI) - z1, z1);
         z2 = float(VM_PI_2) - sign_combine(z, x);
-        z  = select(big, z1, z2);
+        z = select(big, z1, z2);
     }
     else {     // asin
         z1 = float(VM_PI_2) - z1;
-        z  = select(big, z1, z);
-        z  = sign_combine(z, x);
+        z = select(big, z1, z);
+        z = sign_combine(z, x);
     }
 
     return z;
@@ -779,19 +783,19 @@ static inline Vec4f acos(Vec4f const & x) {
 }
 
 #if MAX_VECTOR_SIZE >= 256
-static inline Vec8f asin(Vec8f const & x) { 
+static inline Vec8f asin(Vec8f const & x) {
     return asin_f<Vec8f, Vec8fb, 0>(x);
 }
-static inline Vec8f acos(Vec8f const & x) { 
+static inline Vec8f acos(Vec8f const & x) {
     return asin_f<Vec8f, Vec8fb, 1>(x);
 }
 #endif // MAX_VECTOR_SIZE >= 256
 
 #if MAX_VECTOR_SIZE >= 512
-static inline Vec16f asin(Vec16f const & x) { 
+static inline Vec16f asin(Vec16f const & x) {
     return asin_f<Vec16f, Vec16fb, 0>(x);
 }
-static inline Vec16f acos(Vec16f const & x) { 
+static inline Vec16f acos(Vec16f const & x) {
     return asin_f<Vec16f, Vec16fb, 1>(x);
 }
 #endif // MAX_VECTOR_SIZE >= 512
@@ -809,7 +813,7 @@ static inline Vec16f acos(Vec16f const & x) {
 // result is between -pi/2 and +pi/2 when x > 0
 // result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2
 // atan2(0,0) gives NAN. Future versions may give 0
-template<class VTYPE, class BVTYPE, int T2> 
+template<class VTYPE, class BVTYPE, int T2>
 static inline VTYPE atan_d(VTYPE const & y, VTYPE const & x) {
 
     // define constants
@@ -818,17 +822,17 @@ static inline VTYPE atan_d(VTYPE const & y, VTYPE const & x) {
     const double MOREBITSO2 = MOREBITS * 0.5;
     const double T3PO8 = VM_SQRT2 + 1.; // 2.41421356237309504880;
 
-	const double P4atan = -8.750608600031904122785E-1;
-	const double P3atan = -1.615753718733365076637E1;
-	const double P2atan = -7.500855792314704667340E1;
-	const double P1atan = -1.228866684490136173410E2;
-	const double P0atan = -6.485021904942025371773E1;
+    const double P4atan = -8.750608600031904122785E-1;
+    const double P3atan = -1.615753718733365076637E1;
+    const double P2atan = -7.500855792314704667340E1;
+    const double P1atan = -1.228866684490136173410E2;
+    const double P0atan = -6.485021904942025371773E1;
 
-	const double Q4atan = 2.485846490142306297962E1;
-	const double Q3atan = 1.650270098316988542046E2;
-	const double Q2atan = 4.328810604912902668951E2;
-	const double Q1atan = 4.853903996359136964868E2;
-	const double Q0atan = 1.945506571482613964425E2;
+    const double Q4atan = 2.485846490142306297962E1;
+    const double Q3atan = 1.650270098316988542046E2;
+    const double Q2atan = 4.328810604912902668951E2;
+    const double Q1atan = 4.853903996359136964868E2;
+    const double Q0atan = 1.945506571482613964425E2;
 
     VTYPE  t, x1, x2, y1, y2, s, fac, a, b, z, zz, px, qx, re;  // data vectors
     BVTYPE swapxy, notbig, notsmal;                             // boolean vectors
@@ -840,8 +844,8 @@ static inline VTYPE atan_d(VTYPE const & y, VTYPE const & x) {
         swapxy = (y1 > x1);
         // swap x and y if y1 > x1
         x2 = select(swapxy, y1, x1);
-        y2 = select(swapxy, x1, y1);        
-        t  = y2 / x2;                  // x = y = 0 gives NAN here
+        y2 = select(swapxy, x1, y1);
+        t = y2 / x2;                  // x = y = 0 gives NAN here
     }
     else {    // atan(y)
         t = abs(y);
@@ -850,7 +854,7 @@ static inline VTYPE atan_d(VTYPE const & y, VTYPE const & x) {
     // small:  t < 0.66
     // medium: 0.66 <= t <= 2.4142 (1+sqrt(2))
     // big:    t > 2.4142
-    notbig  = t <= T3PO8;  // t <= 2.4142
+    notbig = t <= T3PO8;  // t <= 2.4142
     notsmal = t >= 0.66;   // t >= 0.66
 
     s = select(notbig, VTYPE(VM_PI_4), VTYPE(VM_PI_2));
@@ -869,7 +873,7 @@ static inline VTYPE atan_d(VTYPE const & y, VTYPE const & x) {
 
     zz = z * z;
 
-    px = polynomial_4 (zz, P0atan, P1atan, P2atan, P3atan, P4atan);
+    px = polynomial_4(zz, P0atan, P1atan, P2atan, P3atan, P4atan);
     qx = polynomial_5n(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);
 
     re = mul_add(px / qx, z * zz, z);            // re = (px / qx) * (z * zz) + z;
@@ -878,7 +882,7 @@ static inline VTYPE atan_d(VTYPE const & y, VTYPE const & x) {
     if (T2) {  // atan2(y,x)
         // move back in place
         re = select(swapxy, VM_PI_2 - re, re);
-        re = select(x < 0., VM_PI   - re, re);
+        re = select(x < 0., VM_PI - re, re);
         re = select((x | y) == 0., 0., re);      // atan2(0,0) = 0 by convention
     }
     // get sign bit
@@ -931,13 +935,13 @@ static inline Vec8d atan(Vec8d const & y) {
 // result is between -pi/2 and +pi/2 when x > 0
 // result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2
 // atan2(0,0) gives NAN. Future versions may give 0
-template<class VTYPE, class BVTYPE, int T2> 
+template<class VTYPE, class BVTYPE, int T2>
 static inline VTYPE atan_f(VTYPE const & y, VTYPE const & x) {
 
     // define constants
-    const float P3atanf =  8.05374449538E-2f;
+    const float P3atanf = 8.05374449538E-2f;
     const float P2atanf = -1.38776856032E-1f;
-    const float P1atanf =  1.99777106478E-1f;
+    const float P1atanf = 1.99777106478E-1f;
     const float P0atanf = -3.33329491539E-1f;
 
     VTYPE  t, x1, x2, y1, y2, s, a, b, z, zz, re;  // data vectors
@@ -953,7 +957,7 @@ static inline VTYPE atan_f(VTYPE const & y, VTYPE const & x) {
         y2 = select(swapxy, x1, y1);
 
         // do we need to protect against x = y = 0? It will just produce NAN, probably without delay
-        t  = y2 / x2;
+        t = y2 / x2;
     }
     else {    // atan(y)
         t = abs(y);
@@ -963,8 +967,8 @@ static inline VTYPE atan_f(VTYPE const & y, VTYPE const & x) {
     // medium: 0.4142 <= t <= 2.4142
     // big:    t > 2.4142  (not for atan2)
     if (!T2) {  // atan(y)
-        notsmal = t >= float(VM_SQRT2-1.);       // t >= tan  pi/8
-        notbig  = t <= float(VM_SQRT2+1.);       // t <= tan 3pi/8
+        notsmal = t >= float(VM_SQRT2 - 1.);       // t >= tan  pi/8
+        notbig = t <= float(VM_SQRT2 + 1.);       // t <= tan 3pi/8
 
         s = select(notbig, VTYPE(float(VM_PI_4)), VTYPE(float(VM_PI_2)));
         s = notsmal & s;      // select(notsmal, s, 0.);
@@ -981,7 +985,7 @@ static inline VTYPE atan_f(VTYPE const & y, VTYPE const & x) {
     else {  // atan2(y,x)
         // small:  z = t / 1.0;
         // medium: z = (t-1.0) / (t+1.0);
-        notsmal = t >= float(VM_SQRT2-1.); 
+        notsmal = t >= float(VM_SQRT2 - 1.);
         a = if_add(notsmal, t, -1.f);
         b = if_add(notsmal, 1.f, t);
         s = notsmal & VTYPE(float(VM_PI_4));
@@ -997,7 +1001,7 @@ static inline VTYPE atan_f(VTYPE const & y, VTYPE const & x) {
     if (T2) {  // atan2(y,x)
         // move back in place
         re = select(swapxy, float(VM_PI_2) - re, re);
-        re = select(x < 0., float(VM_PI)   - re, re);
+        re = select(x < 0., float(VM_PI) - re, re);
         re = select((x | y) == 0.f, 0.f, re);    // atan2(0,0) = 0 by convention
     }
     // get sign bit
@@ -1038,4 +1042,8 @@ static inline Vec16f atan(Vec16f const & y) {
 
 #endif // MAX_VECTOR_SIZE >= 512
 
+#ifdef VCL_NAMESPACE
+}
+#endif
+
 #endif
diff --git a/vectorf64.h b/vectorf64.h
new file mode 100644
index 0000000..ed8f73e
--- /dev/null
+++ b/vectorf64.h
@@ -0,0 +1,377 @@
+/*
+
+    One-element vector classes (Vec1d, Vec1db) wrapping a single double / bool
+    Modelled on vectorclass (VCL), mainly for templating with double
+
+    @author: minh
+    @date:   2016-09-24
+
+
+*/
+
+#ifndef VECTORF64_H
+#define VECTORF64_H
+
+
+//typedef int64_t Vec1db;
+//typedef bool Vec1db;
+
+
+/*****************************************************************************
+*
+*          Vec1db: Vector of 1 Booleans for use with Vec1d
+*
+*****************************************************************************/
+
+class Vec1db {
+public:
+    bool xmm; // The single Boolean element
+    // Default constructor: leaves xmm uninitialized
+    Vec1db() {
+    }
+    // Constructor to broadcast scalar value:
+    Vec1db(bool b) {
+        xmm = b;
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec1db & operator = (bool b) {
+        *this = Vec1db(b);
+        return *this;
+    }
+private: // Prevent constructing from int, etc.
+    Vec1db(int b);
+    Vec1db & operator = (int x);
+public:
+    // Member function to change the single element in vector
+    // Note: index is ignored because the vector holds exactly one element
+    Vec1db const & insert(uint32_t index, bool value) {
+        xmm = value;
+        return *this;
+    }
+    // Member function to extract the single element from vector (index is ignored)
+    bool extract(uint32_t index) const {
+        return xmm;
+    }
+    // Extract a single element. Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    static int size() {
+        return 1;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec1db
+*
+*****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec1db operator & (Vec1db const & a, Vec1db const & b) {
+    return Vec1db(a.xmm && b.xmm);
+}
+static inline Vec1db operator && (Vec1db const & a, Vec1db const & b) {
+    return Vec1db(a.xmm && b.xmm);
+}
+
+// vector operator &= : bitwise and
+static inline Vec1db & operator &= (Vec1db & a, Vec1db const & b) {
+    a = a & b;
+    return a;
+}
+
+
+
+
+/*****************************************************************************
+*
+*          Vec1d: Vector of 1 double precision floating point values
+*
+*****************************************************************************/
+
+class Vec1d {
+public:
+    double xmm; // The single double precision element
+    // Default constructor: leaves xmm uninitialized
+    Vec1d() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec1d(double d) {
+        xmm = d;
+    }
+
+    // Member function to load from array (unaligned)
+    Vec1d & load(double const * p) {
+        xmm = *p;
+        return *this;
+    }
+    // Member function to load from array, aligned by 8 (same plain read as load)
+    Vec1d const & load_a(double const * p) {
+        xmm = *p;
+        return *this;
+    }
+    // Partial load. n == 1 loads *p; any other n sets the element to 0
+    Vec1d & load_partial(int n, double const * p) {
+        switch (n) {
+        case 1:
+            xmm = *p; break;
+        default:
+            xmm = 0.0;
+        }
+        return *this;
+    }
+    // Member function to store into array (unaligned)
+    void store(double * p) const {
+        *p = xmm;
+    }
+    // Member function to store into array, aligned by 8 (same plain write as store)
+    void store_a(double * p) const {
+        *p = xmm;
+    }
+
+    // cut off vector to n elements. n == 0 zeroes the element; any other n leaves it unchanged
+    Vec1d & cutoff(int n) {
+        if (n == 0)
+            xmm = 0.0;
+        return *this;
+    }
+
+    // Member function to change the single element in vector
+    // Note: index is ignored because the vector holds exactly one element
+    Vec1d const & insert(uint32_t index, double value) {
+        xmm = value;
+        return *this;
+    };
+    // Member function to extract the single element from vector (index is ignored)
+    double extract(uint32_t index) const {
+        return xmm;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    double operator [] (uint32_t index) const {
+        return extract(index);
+    }
+
+    static int size() {
+        return 1;
+    }
+};
+
+/*****************************************************************************
+*
+*          Operators for Vec1d
+*
+*****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec1d operator + (Vec1d const & a, Vec1d const & b) {
+    return Vec1d(a.xmm + b.xmm);
+}
+
+// vector operator + : add vector and scalar
+static inline Vec1d operator + (Vec1d const & a, double b) {
+    return a + Vec1d(b);
+}
+static inline Vec1d operator + (double a, Vec1d const & b) {
+    return Vec1d(a) + b;
+}
+
+// vector operator += : add
+static inline Vec1d & operator += (Vec1d & a, Vec1d const & b) {
+    a = a + b;
+    return a;
+}
+
+// postfix operator ++
+static inline Vec1d operator ++ (Vec1d & a, int) {
+    Vec1d a0 = a;
+    a = a + 1.0;
+    return a0;
+}
+
+// prefix operator ++
+static inline Vec1d & operator ++ (Vec1d & a) {
+    a = a + 1.0;
+    return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec1d operator - (Vec1d const & a, Vec1d const & b) {
+    return Vec1d(a.xmm - b.xmm);
+}
+
+// vector operator - : subtract vector and scalar
+static inline Vec1d operator - (Vec1d const & a, double b) {
+    return a - Vec1d(b);
+}
+static inline Vec1d operator - (double a, Vec1d const & b) {
+    return Vec1d(a) - b;
+}
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec1d operator - (Vec1d const & a) {
+    return Vec1d(-a.xmm);
+}
+
+// vector operator -= : subtract
+static inline Vec1d & operator -= (Vec1d & a, Vec1d const & b) {
+    a = a - b;
+    return a;
+}
+
+// postfix operator --
+static inline Vec1d operator -- (Vec1d & a, int) {
+    Vec1d a0 = a;
+    a = a - 1.0;
+    return a0;
+}
+
+// prefix operator --
+static inline Vec1d & operator -- (Vec1d & a) {
+    a = a - 1.0;
+    return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec1d operator * (Vec1d const & a, Vec1d const & b) {
+    return Vec1d(a.xmm * b.xmm);
+}
+
+// vector operator * : multiply vector and scalar
+static inline Vec1d operator * (Vec1d const & a, double b) {
+    return a * Vec1d(b);
+}
+static inline Vec1d operator * (double a, Vec1d const & b) {
+    return Vec1d(a) * b;
+}
+
+// vector operator *= : multiply
+static inline Vec1d & operator *= (Vec1d & a, Vec1d const & b) {
+    a = a * b;
+    return a;
+}
+
+// vector operator / : divide element by element (floating point division)
+static inline Vec1d operator / (Vec1d const & a, Vec1d const & b) {
+    return Vec1d(a.xmm/b.xmm);
+}
+
+// vector operator / : divide vector and scalar
+static inline Vec1d operator / (Vec1d const & a, double b) {
+    return a / Vec1d(b);
+}
+static inline Vec1d operator / (double a, Vec1d const & b) {
+    return Vec1d(a) / b;
+}
+
+// vector operator /= : divide
+static inline Vec1d & operator /= (Vec1d & a, Vec1d const & b) {
+    a = a / b;
+    return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec1db operator == (Vec1d const & a, Vec1d const & b) {
+    return Vec1db(a.xmm == b.xmm);
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec1db operator != (Vec1d const & a, Vec1d const & b) {
+    return Vec1db(a.xmm != b.xmm);
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec1db operator < (Vec1d const & a, Vec1d const & b) {
+    return Vec1db(a.xmm < b.xmm);
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec1db operator <= (Vec1d const & a, Vec1d const & b) {
+    return Vec1db(a.xmm <= b.xmm);
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec1db operator > (Vec1d const & a, Vec1d const & b) {
+    return b < a;
+}
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec1db operator >= (Vec1d const & a, Vec1d const & b) {
+    return b <= a;
+}
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline double horizontal_add (Vec1d const & a) {
+    return a.xmm;
+}
+
+// function max: a < b ? b : a (written out so the call cannot resolve back to this overload via Vec1d(double))
+static inline Vec1d max(Vec1d const & a, Vec1d const & b) {
+    return Vec1d(a.xmm < b.xmm ? b.xmm : a.xmm);
+}
+
+// function min: b < a ? b : a (written out so the call cannot resolve back to this overload via Vec1d(double))
+static inline Vec1d min(Vec1d const & a, Vec1d const & b) {
+    return Vec1d(b.xmm < a.xmm ? b.xmm : a.xmm);
+}
+
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0f, -INF and -NAN
+static inline Vec1d abs(Vec1d const & a) {
+    return Vec1d(fabs(a.xmm));
+}
+
+// function log: logarithm
+// Applies log() to the double element and wraps the result in a Vec1d
+static inline Vec1d log(Vec1d const & a) {
+    return Vec1d(log(a.xmm));
+}
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec1d mul_add(Vec1d const & a, Vec1d const & b, Vec1d const & c) {
+    return a * b + c;
+}
+
+// Multiply and subtract
+static inline Vec1d mul_sub(Vec1d const & a, Vec1d const & b, Vec1d const & c) {
+    return a * b - c;
+}
+
+// Multiply and inverse subtract
+static inline Vec1d nmul_add(Vec1d const & a, Vec1d const & b, Vec1d const & c) {
+    return c - a * b;
+}
+
+
+/*****************************************************************************
+*
+*          Horizontal Boolean functions
+*
+*****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and (Vec1db const & a) {
+    return a.xmm;
+}
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or (Vec1db const & a) {
+    return a.xmm;
+}
+
+// function exp: applies exp() to the double element and wraps the result in a Vec1d
+static inline Vec1d exp(Vec1d const & x) {
+    return Vec1d(exp(x.xmm));
+}
+
+
+
+#endif //VECTORF64_H

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/iqtree.git



More information about the debian-med-commit mailing list