[med-svn] [iqtree] 01/08: Imported Upstream version 1.3.11.1+dfsg

Andreas Tille tille at debian.org
Thu Dec 17 16:18:59 UTC 2015


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository iqtree.

commit b24d046b8a9597e8beaf30ea7822b06c0327875d
Author: Andreas Tille <tille at debian.org>
Date:   Thu Dec 17 14:15:21 2015 +0100

    Imported Upstream version 1.3.11.1+dfsg
---
 CMakeLists.txt                      |   11 +-
 Documents/iqtree-manual-1.0.pdf     |  Bin 158099 -> 0 bytes
 examples/example.phy                |   46 -
 iqtree.cpp                          |    3 +-
 phylotree.h                         |    2 +-
 pll/CMakeLists.txt                  |   67 +
 pll/alignment.c                     |  754 +++
 pll/avxLikelihood.c                 | 4111 ++++++++++++++++
 pll/bipartitionList.c               |  434 ++
 pll/cycle.h                         |  516 +++
 pll/errcodes.h                      |   69 +
 pll/evaluateGenericSpecial.c        | 3321 +++++++++++++
 pll/evaluatePartialGenericSpecial.c | 1378 ++++++
 pll/fastDNAparsimony.c              | 1942 ++++++++
 pll/genericParallelization.c        | 2283 +++++++++
 pll/genericParallelization.h        |  127 +
 pll/globalVariables.h               |  170 +
 pll/hardware.c                      |  165 +
 pll/hardware.h                      |   48 +
 pll/hash.c                          |  219 +
 pll/hash.h                          |   50 +
 pll/lexer.c                         |  299 ++
 pll/lexer.h                         |   88 +
 pll/makenewzGenericSpecial.c        | 3145 +++++++++++++
 pll/mem_alloc.c                     |  228 +
 pll/mem_alloc.h                     |   77 +
 pll/mic_native.h                    |   56 +
 pll/mic_native_aa.c                 | 1254 +++++
 pll/mic_native_dna.c                |  676 +++
 pll/models.c                        | 4377 ++++++++++++++++++
 pll/newick.c                        |  583 +++
 pll/newick.h                        |   61 +
 pll/newviewGenericSpecial.c         | 8736 +++++++++++++++++++++++++++++++++++
 pll/optimizeModel.c                 | 3149 +++++++++++++
 pll/parsePartition.c                |  388 ++
 pll/parsePartition.h                |   51 +
 pll/parsimony.c                     |  865 ++++
 pll/pll.h                           | 1692 +++++++
 pll/pllInternal.h                   |  313 ++
 pll/pthread.h                       | 1368 ++++++
 pll/queue.c                         |   96 +
 pll/queue.h                         |   48 +
 pll/randomTree.c                    |  177 +
 pll/recom.c                         |  689 +++
 pll/restartHashTable.c              |  357 ++
 pll/sched.h                         |  183 +
 pll/searchAlgo.c                    | 3310 +++++++++++++
 pll/semaphore.h                     |  169 +
 pll/ssort.c                         |  121 +
 pll/stack.c                         |   85 +
 pll/stack.h                         |   48 +
 pll/topologies.c                    |  778 ++++
 pll/trash.c                         |  129 +
 pll/treeIO.c                        |  236 +
 pll/treeIO.h                        |   23 +
 pll/utils.c                         | 3735 +++++++++++++++
 pllnni.h                            |    2 +-
 57 files changed, 53252 insertions(+), 56 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d93cb1..efda1d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,7 +45,7 @@ add_definitions(-DIQ_TREE)
 # The version number.
 set (iqtree_VERSION_MAJOR 1)
 set (iqtree_VERSION_MINOR 3)
-set (iqtree_VERSION_PATCH 11) 
+set (iqtree_VERSION_PATCH "11.1") 
 
 set(BUILD_SHARED_LIBS OFF)
 
@@ -193,11 +193,11 @@ if (IQTREE_FLAGS MATCHES "omp")
 	
 	if (VCC) 
   		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
-  		include_directories("${PROJECT_SOURCE_DIR}/pllrepo/src") # for PThreads headers 
+  		include_directories("${PROJECT_SOURCE_DIR}/pll") # for PThreads headers 
 	elseif (ICC)
   		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qopenmp")
   		if (WIN32)
-  			include_directories("${PROJECT_SOURCE_DIR}/pllrepo/src") # for PThreads headers
+  			include_directories("${PROJECT_SOURCE_DIR}/pll") # for PThreads headers
   		endif() 
   	elseif (GCC)
 		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
@@ -334,7 +334,7 @@ include_directories("${PROJECT_BINARY_DIR}")
 ##################################################################
 # subdirectories containing necessary libraries for the build 
 ##################################################################
-add_subdirectory(pllrepo/src)
+add_subdirectory(pll)
 add_subdirectory(ncl)
 add_subdirectory(lbfgsb)
 add_subdirectory(whtest)
@@ -518,7 +518,6 @@ install (TARGETS iqtree DESTINATION bin)
 install (FILES "${PROJECT_SOURCE_DIR}/example/models.nex" DESTINATION .)
 install (FILES "${PROJECT_SOURCE_DIR}/example/example.phy" DESTINATION .)
 install (FILES "${PROJECT_SOURCE_DIR}/example/example.nex" DESTINATION .)
-install (FILES "${PROJECT_SOURCE_DIR}/Documents/iqtree-manual-1.0.pdf" DESTINATION .)
 
 if (WIN32)
 	install (FILES "${BINARY_DIR}/iqtree${EXE_SUFFIX}-click.exe" DESTINATION bin)
@@ -564,7 +563,7 @@ endif()
 #set(CPACK_SOURCE_PACKAGE_FILE_NAME
 #  "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}")
 set(CPACK_SOURCE_IGNORE_FILES
-  "/build.*/;/debug.*/;/examples/;/test_scripts/;/manual/;/.bzr/;~$;/\\\\.svn/;/\\\\.git/;/pll/;/pllrepo.dox/;/pllrepo.examples/;/pllrepo.figures/;/pllrepo.legacy/;/pllrepo.m4/;/pllrepo.man/;/pllrepo.MPI/;/pllrepo.sMSA/;/pllrepo.testdata/;${CPACK_SOURCE_IGNORE_FILES}")
+  "/build.*/;/debug.*/;/examples/;/test_scripts/;/manual/;/.bzr/;~$;/\\\\.svn/;/\\\\.git/;/pllrepo/;${CPACK_SOURCE_IGNORE_FILES}")
 
 set (SYSTEM_NAME "${CMAKE_SYSTEM_NAME}")
 if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin")
diff --git a/Documents/iqtree-manual-1.0.pdf b/Documents/iqtree-manual-1.0.pdf
deleted file mode 100644
index 1a6e27a..0000000
Binary files a/Documents/iqtree-manual-1.0.pdf and /dev/null differ
diff --git a/examples/example.phy b/examples/example.phy
deleted file mode 100644
index 8637b06..0000000
--- a/examples/example.phy
+++ /dev/null
@@ -1,46 +0,0 @@
- 44 384 
-FL-1-103     atgcgcatcacccaaggc---------------------accttctccttcctgcccgacctcacggcggcccaggtcaaggcccagatccagtatgcgctggaccagaactgggcggtctcggtggagtacacggacgatccc------------------------------------------------------catccccggaacacctattgggagatgtggggcctgcccatgttcgacctgcgcgatgccgccggcgtctatggcgaggtcgaggcctgccgcaccgcccatcccggcaagtatgtgcgggtgaacgccttcgactccaatcgcgggtgggagacggtgcgcctctccttcatcgtccagcgtccg
-OSH-1-103    atgcgcatcacccaaggc---------------------tgcttctcgttcctgcccgacctgaccgacgagcagatctcggcgcaggtggactattgcctcggccgcggctgggccgtgagcctcgaacataccgacgacccg------------------------------------------------------catccccggaacacctactgggaaatgtggggcatgccgatgttcgacctgcgcgaccccaagggcgtgatgatcgagctggacgagtgccgcaaggcctggcccggccgctacatccgcatcaatgccttcgattccacccgcggcttcgagacggtcacgatgagcttcatcgtcaaccgcccc
-CEU-1-103    atgcgcatcactcaaggc---------------------actttttccttcctgcccgaactgaccgacgagcagatcaccaaacagctcgaatactgcctgaaccagggctgggcggtcggcctcgaatacaccgacgacccg------------------------------------------------------cacccgcgcaacacgtactgggagatgttcgggctgccgatgttcgacctgcgcgatgccgccggcatcctgatggaaatcaacaacgcgcggaacaccttccccaaccactacatccgcgtcacggccttcgattcgacgcatacggtggagtcggtggtgatgtcgttcatcgtcaatcgtccc
-TH-1-103     atgagacttacacaaggc---------------------gcattttcgttcttacctgacttaacagatgagcaaatcgtaaaacaaattcaatatgctatcagcaaaaactgggctttaaacgttgaatggacagatgatccg------------------------------------------------------caccctcgcaacgcatactgggatttatggggattaccattatttggtattaaagatccagcggctgtaatgtttgaaatcaatgcttgccgtaaagctaaaccagcttgttacgtaaaagtaaatgcgtttgataactcacgtggtgtagaaagctgctgcttatcttttatcgttcaacgtcct
-CAa1-103     atgaaactaacacaagga---------------------gctttctcatttcttcctgacttaactgatgcgcaagtaactaagcaaatccagtacgctttaaataagagttgggctatttcgattgaatatactgatgatccg------------------------------------------------------cacccacgtaacagttactgggagatgtggggccttcctctattcgatgttaaggatccagctgcgattcttttcgaaatcaacatggctcgtaaggctaagcctaactactaccttaaaatagcttgttttgataacacacgtggtatcgaaagttgtgtactttctttcattgtacaacgtcct
-CAb1-103     gtgagagttacacaagga---------------------acattttcttttctaccagacctgacaaatgatcaaatcagaaaacaaattcaatatgccataaataaaggatgggcattgagtgtagaatatacagatgaccct------------------------------------------------------cacccacggaattcttactgggaaatgtggggactgcctttatttgatgtcaaagaccctgcggcaattatgtttgaagttgaagcttgtcgaaaagagaaaagcaactattatattaagctattagcttttgattcaaccaaaggagttgaaagtacagcaatgtcctttatggtcaataggcct
-SI-1-103     atgagagttacacaagga---------------------tgtttttcgtttttaccagatttaagtgatgatcaaattaaacaacaagtttcttacgctatgagcaaaggttgggcggttagtgtagaatggacagatgatcca------------------------------------------------------catccacgtaactcatattgggaattatggggtcttcctttatttgatgttaaagatccagctgcagttatgtatgaacttgctgaatgtagaaaagttaacccagaaggttatattaaaattaatgctttcgatgctagtattggtacagaaagttgtgtaatgtcttttattgtacaacgtcct
-LU-1-103     gtgagacttacacaagga---------------------gctttttcttatttaccagatttaactgatgcacaaatcatcaaacaaattgactactgcttaagcagaggttggtctgttggtgttgaatggactgatgatcca------------------------------------------------------cacccacgtaacgcttactgggaactatggggtcttccattatttgacgtaaaagattcttcagcaattttatacgaagttaatgaatgtcgtcgtttaaaccctgaaggttacattaaattagttgctttcaacgcagcacgtggtactgaaagtagtgcatctgcttttattgtacaacgtcca
-SU-1-103     gtgagaataactcaaggt---------------------accttttcttttttgccggacttgactgatgaacaaatcaaaaaacaaattgattatatgatatctaaaaaattagctataggtattgaatatactaacgacata------------------------------------------------------catcctagaaattcattttgggaaatgtggggattacctctatttgaggtcacagatccagctccagtattatttgaaattaatgcttgtcgtaaagcaaaaagtaatttctatatcaaggtagtaggattttcttctgaaagaggtatagaaagtacaataatttcatttattgtaaatagacca
-RP-56-175    atgcaggtgtggccaccagttggcaagaagaagtttgagaccctttcataccttccacccctcactgatgagcaattgcttaaggaagtagagtatcttctaaggaagggatgggttccatgtgttgaatttgagttggagaaa------------------ggatttgtccaccgtcagtacaacagttcaccaggatactatgatggacgttactggacaatgtggaggttgccattgtttggaaccactgatgctgctcaggtgttgaaggaagttgctgaatgtaaagcagaatacccagaagctttcatccgtatcatcggatttgacaacgttcgt------caagtgcaatgcattagtttcattgcaagcacaccc
-A-14-133     atgcaggtgtggcctccaattggaaagaagaagtttgagactctttcctatttgccaccattgacgagagatcaattgttgaaagaagttgaataccttctgaggaagggatgggttccatgcttggaatttgagttgctcaaa------------------ggatttgtgtacggtgagcacaacaagtcaccaagatactatgatggaagatactggacaatgtggaagcttcctatgtttggcaccactgatcctgctcaagtcgtgaaggaggttgatgaagttgttgccgcttaccccgaagctttcgttcgtgtcatcggtttcaacaacgttcgt------caagttcaatgcatcagtttcattgcacacacacca
-PR-57-176    atgcaggtgtggccaccacgtaatttgaagaagtttgagaccctatcataccttccaactctttccgaggagtcattgttgaaggagatcaactaccttctaatcaagggatgggttccttgccttgagttcgaagttggaccg------------------gcacatgtataccgtgagaacaacaagtcaccaggatactatgacggaaggtactggacaatgtggaagctacccatgttcggatgcactgacgcatcccaagttgcagctgaggtggtcgagtgcaagaacgcttaccctgatgcccacgtcagaatcattggattcgacaacaagcgt------caagtccagtgcatcagtttcattgcctacaaacct
-PY-61-180    atgcaggtgtggcctccactcggactgaagaagttcgagaccctctcttaccttcctcccctttcttccgagtccttggccaaggaagttgactacctcctccgcaagaactgggttccctgcttggaatttgagttggagact------------------ggattcgtgtaccgtgagaaccacaggtccccaggatactatgatggaaggtactggacaatgtggaagctgcccatgttcggatgcaccgactcttcccaggtgttgaaggagctggaagaggccaagaaggcttacccccagtccttcatccgtatcatcggattcgacaatgtccgt------caagtgcagtgcatcagtttcatcgcttacaagcct
-MGI-58-176   atgcaggtgtggccgccggagggcctgaagaagttcgagaccctctcctacctcccccctctctccgtcgaggacctcgccaaggaggtggactacctcctccgcaacgactgggttccctgcatcgagttctccaaggaa---------------------gggttcgtgtaccgcgagaaccacgcgtcgcccgggtactacgacgggcggtactggacgatgtggaagctgcccatgttcggctgcaccgacgccagccaggtgatcgccgaggtggaggaggccaagaaggcctaccccgagtacttcgtcagaatcatcggcttcgacaacaagcgc------caagtccagtgcatcagcttcatcgcctacaagccc
-SCR-58-177   tgcatggtgtggccaccactaggaatgaagaagtttgagactctgtcttacctgccccctctatccgaagagtcattgttgaaggaggtccaataccttctcaacaatggatgggttccctgcttggaattcgagcccactcac------------------ggatttgtgtaccgtgagcacggaaacacaccaggatactacgatggacgttactggacaatgtggaagttgcccatgttcggttgcactgacccatcccaggttgttgctgagctcgaggaggccaagaaggcttaccctgaggccttcatccgtatcataggattcgacaacgtgcgt------caagtccagtgtgtcagtttcatcgcctacaagccc
-SA-60-179    atgaaggtgtggccaccacttggattgaggaagttcgagactctttcttacctgcctgatatgagtaacgaacaattgtcaaaggaatgtgactaccttctcaggaatggatgggttccctgcgttgaattcgacatcggaagc------------------ggattcgtgtaccgtgagaaccacaggtcaccaggattctacgatggacgttactggaccatgtggaagctccctatgtttggctgcaccgactcatctcaggtgattcaggagattgaggaggctaagaaggaataccccgacgcattcatcagggttattggctttgacaacgtccgt------caagtccagtgcatcagtttcatcgcctacaagccc
-BR-60-179    atgcaggtatggccaccacgtgggaagaagttctacgagactctctcataccttccaccccttacaagggagcaattggccaaggaagttgaataccttcttcgcaagggatgggttccttgcttggaattcgagttggagcat------------------ggaaccgtgtaccgtgagtaccacagatcaccagggtactatgatggtcgttactggaccatgtggaagctgcccatgtttggttgcacagatgcagtgcaggtgttgcaggagcttgatgagatgattaaagcttacccagattgctatggtaggatcattggtttcgacaatgttcgc------caagtccagtgcattagtttccttgcctacaagcct
-CPL-58-177   atgcaggtgtggccaccaattaacaagaagaagtacgagactctctcatacctccctgatttgagccaagagcaattgcttagcgaaattgagtaccttttgaaaagtggatgggttccttgcttggaattcgaaactgagcgc------------------ggatttgtctaccgtgaacaccaccattcaccaggatactatgacggcaggtactggaccatgtggaagctacctatgttcggatgcactgatgccacccaagtgttggctgaggtggaagaggcgaagaaggcatacccacaggcctgggtccgtattattggattcgacaacgtgcgt------caagtgcagtgcatcagtttcattgcctacaagcca
-LTU-59-178   atgcaggtgtggccaccaattaacatgaagaaatacgagacattgtcataccttcctgacttgtccgatgagcaattgctcaaggaagttgagtaccttttgaaaaatggatgggttccttgcttggaattcgagactgagcac------------------ggatttgtgtaccgtgagcacaacagctcaccaggatactacgatggtagatactggaccatgtggaagttgcctatgtttgggtgcactgacggaacccaggtgttggctgaggttcaagaggccaagaatgcgtacccacaggcctggatccgtattatcggattcgacaacgttcgt------caagtgcagtgcatcagtttcattgcctacaagcca
-TSP-58-177   atgcaggtgtggcccccatatggcaagaagaagtacgagactctctcataccttcctgatttaaccgacgagcaattgctcaaggagattgagtaccttttgaacaagggatgggttccttgcttggaatttgagactgagcac------------------ggatttgtctaccgtgaataccacgcctcacctagatactatgatggaaggtactggaccatgtggaagttgcccatgtttgggtgcactgatgcaactcaggtgttgggtgagctccaagaggccaagaaggcttaccctaatgcatggatcagaatcatcggattcgacaacgtccgt------caagtgcaatgcatcagtttcattgcctacaagcca
-YBN-56-175   atgcaggtgtggccaccagttggcaagaagaagtttgagactctttcctacctgccagaccttgatgatgcacaattggcaaaggaagtagaataccttcttaggaagggatggattccttgcttggaattcgagttggagcac------------------ggtttcgtgtaccgtgagcacaacaggtcactaggatactacgatggacgctactggaccatgtggaagctgcctatgtttggttgcactgatgcttctcaggtgttgaaggagcttcaagaggctaagactgcataccccaacggcttcatccgtatcatcggattcgacaacgttcgc------caagtgcagtgcatcagcttcatcgcctacaagccc
-AN-56-175    atgaaggtgtggccaccacttggattgaagaagtacgagactctctcatacttaccaccactaactgaaactcagttggctaaggaagtcgactacttgctccgcaaaaaatgggttccttgtttggaattcgagttggagcac------------------ggttttgtctaccgtgagaacgccagatcccccggatactatgacggaagatactggacaatgtggaaattgcctatgttcggttgcaccgactcagcccaagtgatgaaggagcttgctgaatgcaagaaggagtacccccaggcctggatccgtatcatcggatttgacaatgttcgt------caagttcaatgtatcatgttcattgcttccaggcca
-HI-60-179    atgcaggtgtggcctcctcttgggaagaagaagttcgagacactctcatacctccccgatcttacacccgtacagttggctaaggaagtagattaccttcttcgctctaaatggattccttgcttggaattcgaattagaggag------------------ggattcgtgcaccgtaagtactcgagcttacccacgtactacgatggacgctactggaccatgtggaaactgcccatgtttgggtgcactgactcggctcaggtgttggaggagcttgagaattgcaagaaggaataccccaatgcattcattagaatcattgggttcgacaacgttcgt------caagtgcagtgcattagtttcattgcctacaagcct
-ANA-56-175   atgaaggtgtggccaccagttggaaagaagaagtttgagaccctctcttaccttcctgaccttaccgaagttgaattgggtaaggaagtcgactaccttctccgcaacaagtggattccttgtgttgaattcgagttggagcac------------------gggtttgtttaccgtgagcacggaagcacccccggatactacgatggccgttactggacaatgtggaagcttcccttgttcggatgcactgactctgctcaagtgttgaaggaagtccaagaatgcaaaacggagtaccctaacgctttcatcaggatcatcggattcgacaacaaccgt------caggtccagtgcatcagtttcatcgcctacaagcca
-ZE-48-166    atgcaggtgtggccggcctacggcaacaagaagttcgagacgctgtcgtacctgccgccgctgtcgacggacgacctgctgaagcaggtggactacctgctgcgcaacggctggataccctgcctcgagttcagcaaggtc---------------------ggcttcgtgtaccgcgagaactccacctccccgtgctactacgacggccgctactggaccatgtggaagctgcccatgttcggctgcaacgacgccacccaggtgtacaaggagctgcaggaggccatcaaatcctacccggacgccttccaccgcgtcatcggcttcgacaacatcaag------cagacgcagtgcgtcagcttcatcgcctacaagccc
-EAT-48-166   atgcaggtgtggccaattgagggcatcaagaagttcgagaccctgtcttacttgccacccctctccacggaggccctcttgaagcaggtcgactacttgatccgctccaagtgggtgccctgcctcgagttcagcaaggtt---------------------ggcttcgtcttccgtgagcacaacagctcccccgggtactacgacggtcgatactggacaatgtggaagctgcctatgttcgggtgcaccgacgccacacaggtgctcaacgaggtggaggaggttaagaaggagtaccctgatgcgtatgtccgcgtcatcggtttcgacaacatgcgc------caggtgcaatgcgtcagcttcattgccttcaggcca
-YSA-46-164   atgcaggtgtggccgattgagggcatcaagaagttcgagaccctctcctacctgccaccgctcaccgtggaggacctcctgaagcagatcgagtacctagctccgttccaagtggtgccctgcctcgagttcagcaaggtc---------------------ggatttgtctaccgtgagaaccacaagtcccctggatactacgacggcaggtactggaccatgtggaagctgcccatgttcgggtgcaccgacgccacccaggtcgtcaaggagctcgaggaggccaagaaggcgtaccctgatgcattcgtccgtatcatcggcttcgacaacgttagg------caggtgcagctcatcagcttcatcgcctacaacccg
-TH-52-170    atgcaggtgtggcctccattcggaaaccccaagtttgagactctgtcctacctccctacgctaaccgaggagcagctggtgaaggaggttgagtacttgttgaggaacaagtgggtgccttgtctagagtttgatctggaa---------------------ggatccatctcgaggaagtataataggagcccggggtactacgatgggagatactgggtgatgtggaagttgccgatgtttgggtgcacagaggcatctcaggtgataaacgaggtgagagagtgtgccaaggcataccccaaagccttcatccgtgtcattggctttgacaacgtccgc------caagtgcagtgcatctccttcatcgtccacaagccc
-LA-68-186    atgcaggtgtggcctccttacgcgaataaaaagtttgagactctgtcgtatctccctcgcttgaccccggagcaactggtgaaggaggtggagtacctgctgaagaacaagtgggtgccctgcctggaattcgaggaggat---------------------ggtgaaataaagagagtgtatgggaatagcccagggtactacgacgggagatactgggtgatgtggaagctgcctatgttcggatgcacagaggcatcgcaggtgttgaacgaggtgaacgagtgtgcgaaggcataccccaacgccttcatccgcgtcatcggattcgacaacgtccgc------caagtgcagtgcatctccttcatcgtccacaagcct
-GR-854-978   atgaaggtgtggaaccccgtcaacaacaagaagttcgagaccttctcctacctgccccccctgtctgacgcccagatcgccaagcaggtggacatgatcattgccaaggggctctccccctgcctggagttcgccgccccggagaacagcttcatcgccaatgacaacactgtgcgcttcagcggcaccgctgcaggctactatgacaaccggtactggaccatgtggaagctgcccatgttcggctgcacggacgccagccaggtgctgcgtgagatctccgagtgccgcagggcctacccccagtgctacgtccgc---ctggccttcgactccgtcaag------caggtgcaggtgatctcgttcgtggtgcagcgcccc
-MO-29-154    ttcaaggtctggcagcccgtgaacaacaagcagtacgagaccttctcctacctgccccccctgaccaaccagaagatcggccgtcaggtcgactacatcatcaacaacggctggaccccctgcttggagttcgctgacccctccacctccttcgtcagcaacgcgaacgccgtgcgcctccagggtgtctccgctggctactacgacaacaggtactggaccatgtggaagctgcccatgttcggctgcactgaccccagccaggtgctgcgcgaggtgtccgcctgccaggtggccttccccaacgtgtacatccgcctggttgccttcgacaacgtcaag------caggtgcagtgcatgggcttcctagtgcagcgcccc
-OE-36-161    atgatggtatggtagccctttaacaataagttctttgagaccttctcgtacttgccccctctcactgacgaccaaatcaccaagcaagtggactacatcttgagaaacaattggactccttgtctggagtttgcgggatccgaccaagcgtatgtgacccacgacaacacggtaagaatgggagattgtgcatccacttatcaggacaacagatattggaccatgtggaaattgcctatgttcggttgcattgatggatcgcaagtgttgaccgaaatttcagcttgcactaaggcctttcctgatgcctacatccgtttggtgtgttttgatgcaaatagg------caagtccaaatttccggctttttggtacataggccc
-EME-43-168   atgatggtttggtagcccttcaacaacaaaatgtttgaaactttttccttcttgcctcccttgactgatgaacaaattagcaaacaagtggactacatcttggccaactcctggaccccctgtcttgaatttgcagcttctgatcaagcttatgctggcaatgaaaattgcatcagaatgggacctgtggcttctacctaccaagacaatagatattggacaatgtggaagctacctatgtttggatgcacagacggctctcaagtgttgagcgagatccaagcatgcacaaatgctttccccgatgcttacatcagattggtttgttttgacgcaaacaga------taggtgtaaatttctggatttttggtgcacagacct
-LRE-46-171   atgatggtctggaccccggtcaacaacaagatgttcgagaccttctcctacctgccccccctgagcgacgagcagatcgccgcccaggtcgactacattgtcgccaacggctggatcccctgcctggagttcgctgagtcggacaaggcctacgtgtccaacgagtcggccatccgcttcggcagcgtgtcttgcctgtactacgacaaccgctactggaccatgtggaagctgcccatgttcggctgccgcgaccccatgcaggtgctgcgcgagatcgtcgcctgcaccaaggccttccccgatgcctacgtgcgcctggtggccttcgacaaccagaag------caggtgcagatcatgggcttcctggtccagcgcccc
-P6-2-107     atgaaaactctgcccaaa------gagcgtcgtttcgagactttctcgtacctgcctcccctcagcgatcgccaaatcgctgcacaaatcgagtacatgatcgagcaaggcttccaccccttgatcgagttcaacgagcac------------------------------------------------------tcgaatccggaagagttctactggacgatgtggaagctccccctgtttgactgcaagagccctcagcaagtcctcgatgaagtgcgtgagtgccgcagcgaatacggtgattgctacatccgtgtcgctggcttcgacaacatcaag------cagtgccaaaccgtgagcttcatcgttcatcgtccc
-HO-1-106     atgaaaactctgcccaaa------gagcgtcgctacgaaaccctttcctacctgccccccctgagcgatcagcaaattgctcgccagattgagtacatggtgcgcgaaggctatattcccgccgtggaattcaacgaagat------------------------------------------------------tccgacgcgaccacctgctactggaccatgtggaagttgcccctgttccacgccacttctacccaagaagtgttgggcgaagtgcgcgagtgccgcaccgaataccccaactgctacatccgcgtagttggtttcgacaacatcaag------cagtgtcagtccgtgagcttcatcgttcacaagccc
-SP-1-106     atgcaaaccttaccaaaa------gagcgtcgttacgaaaccctttcttacttaccccccctcaccgacgttcaaatcgaaaagcaagtccagtacattctgagccaaggctacattccagccgttgagttcaacgaagtt------------------------------------------------------tctgaacctaccgaactttattggacactgtggaagctacctttgtttggtgctaaaacatcccgtgaagtattggcagaagttcaatcttgccgttctcaatatcctggtcactacatccgtgttgtaggatttgacaatattaag------cagtgccaaatcctgagcttcatcgttcacaaaccc
-PA-1-105     ---atgcaacttagagta------gaacgtaagttcgaaactttttcttatttaccaccattaaacgaccaacagattgcgcgtcaattacaatacgcactttccaatggttatagcccagcaatcgaattcagttttaca------------------------------------------------------ggtaaagctgaagacttagtatggactttatggaaattacctttatttggtgcacaatctcctgaagaagtacttagcgaaattcaagcttgtaaacaacagttccctaatgcttacattcgtgttgtagcatttgactctatcaga------caagttcaaactttaatgttcttagtttacaaacca
-NE-2-109     gctgaaatgcaggattacaagcaaagcctcaaatatgagactttctcttatcttccacccatgaacgcggaacgcatccgcgctcaaatcaagtacgcaattgctcaaggctggagccccggcattgagcacgtagaagtgaaa------------------------------------------------------aactccatgaaccaatattggtacatgtggaaacttcccttcttcggcgaacaaaatgtcgacaacgtgttggctgaaattgaagcgtgtcgtagtgcgtatccaacacaccaggtcaaactggtggcttatgacaactatgcg------caaagcttaggtctggccttcgtggtctaccgcggc
-IFE-2-109    gctgacattcaggactacaactcaacacccaagtacgaaaccttctcttatttgccggcaatgggaccggaaaaaatgcgccgtcagatcgcctatctcatcaatcagggctggaaccccggcatcgagcatgtggaacctgaa------------------------------------------------------cgcgcatcaacatactactggtacatgtggaagttacccatgttcggcgaacagtcggtggacaccgtgatcatggagttggaagcatgccatcgcgctcaccccggccatcacgtgcgcttggtcgggtatgacaattactcg------cagagccagggcagcgcttttgtggtgtttcgcggg
-HS-9-115     ---tcgagcgtcagcgatccgtcgagccgcaagttcgagaccttctcctacctgcccgaactcggcgtggaaaagatccgcaagcaggtcgagtacatcgtcagcaagggctggaacccggccgtcgagcacaccgagccggag------------------------------------------------------aacgccttcgaccactactggtacatgtggaagctgccgatgttcggcgaaaccgacgtggacgccatcctggccgaggccgaggcatgccacaaggcgcatccctcgcatcacgtgcgcctgatcggctacgacaactatgcc------cagtcgcaaggcactgccatggtgatcttccgcggc
-RVI-7-114    agttccagcctcgaagacgtcaacagccgcaagttcgagaccttctcctacctgccgcgcatggatgccgaccgcatccgcaagcaggtcgagtacatcgtctccaagggctggaacccggccatcgagcacaccgagccggaa------------------------------------------------------aacgccttcgatcactactggtacatgtggaagctgccgatgttcggcgagaccgacatcgacaccatcctcaaggaggccgaagcctgccacaaggcgcaccccaacaatcacgtgcgtctgatcggcttcgacaactatgcc------cagtccaagggcgccgagatggtggtctatcgcggc
-IFE-8-115    aaatcccgtctctccgacccggcgagcgcgaagttcgagacactgtcttacctgcccgccctgaccgcggacgagatccgtcaacaggttgcgtatattgtttccaagggctggaatccggcggtagaacataccgaaccggaa------------------------------------------------------aacgccttcggcaactactggtatatgtggaagttgcccatgttcggcgaaacggacgtggacaccattctgaaagaagcggaacgctgccataagcggaatccccataaccacgtccgtatcgtcggctatgataacttcaag------cagtcccagggtacttccctggtagtctatcggggc
-RVI-5-112    agcagcatgggcgatcacgccaccatcggccgctacgagaccttttcctatctgccgccgctcaaccgcgaggagatcctggagcagatcctctacatcctcgacaacggctggaacgcctcgctggagcacgagcatccggat------------------------------------------------------cgcgccttcgagtattactggccgatgtggaagatgcccttcttcggcgaacaggatccgaacgtgatcctgaccgagatcgagtcctgccggcgcagctatccggaccatcacgtccggctggtcggctacgacacctacgcc------cagagcaagggacattccttcctggcgcaccgcccg
-
diff --git a/iqtree.cpp b/iqtree.cpp
index 6678274..19ca598 100644
--- a/iqtree.cpp
+++ b/iqtree.cpp
@@ -25,8 +25,7 @@
 #include "model/modelgtr.h"
 #include "model/rategamma.h"
 #include <numeric>
-#include "pllrepo/src/pllInternal.h"
-#include "pllrepo/src/pll.h"
+#include "pll/pllInternal.h"
 #include "pllnni.h"
 #include "vectorclass/vectorclass.h"
 #include "vectorclass/vectormath_common.h"
diff --git a/phylotree.h b/phylotree.h
index ddf7f28..afee186 100644
--- a/phylotree.h
+++ b/phylotree.h
@@ -30,7 +30,7 @@
 #include "phylonode.h"
 #include "optimization.h"
 #include "model/rateheterogeneity.h"
-#include "pllrepo/src/pll.h"
+#include "pll/pll.h"
 
 #define BOOT_VAL_FLOAT
 #define BootValType float
diff --git a/pll/CMakeLists.txt b/pll/CMakeLists.txt
new file mode 100644
index 0000000..fd99063
--- /dev/null
+++ b/pll/CMakeLists.txt
@@ -0,0 +1,67 @@
+#set( CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -Wall -msse3 -DRAXML_USE_LLALLOC -D_USE_PTHREADS -D_OPTIMIZED_FUNCTIONS -D__SIM_SSE3 -fno-builtin" )
+
+#add_executable( raxml_light axml.c  optimizeModel.c trash.c searchAlgo.c topologies.c treeIO.c models.c evaluatePartialGenericSpecial.c evaluateGenericSpecial.c newviewGenericSpecial.c makenewzGenericSpecial.c bipartitionList.c restartHashTable.c fastDNAparsimony.c randomTree.c lockless_allocator/ll_alloc.c mem_alloc.c recom.c)
+
+#target_link_libraries( raxml_light m pthread )
+
+if (NOT BINARY32 AND NOT IQTREE_FLAGS MATCHES "novx")
+add_library(pllavx 
+	avxLikelihood.c)
+endif()
+
+if (IQTREE_FLAGS MATCHES "omp")
+add_library(pll 
+  alignment.c
+  bipartitionList.c
+  evaluateGenericSpecial.c
+  evaluatePartialGenericSpecial.c
+  fastDNAparsimony.c
+  hardware.c
+  hash.c
+  lexer.c
+  makenewzGenericSpecial.c
+  models.c
+  newick.c
+  newviewGenericSpecial.c
+  genericParallelization.c
+  optimizeModel.c
+  parsePartition.c
+  queue.c
+  randomTree.c
+  recom.c
+  restartHashTable.c
+  searchAlgo.c
+  ssort.c
+  stack.c
+  topologies.c
+  trash.c
+  treeIO.c
+  utils.c)
+else()
+add_library(pll 
+  alignment.c
+  bipartitionList.c
+  evaluateGenericSpecial.c
+  evaluatePartialGenericSpecial.c
+  fastDNAparsimony.c
+  hardware.c
+  hash.c
+  lexer.c
+  makenewzGenericSpecial.c
+  models.c
+  newick.c
+  newviewGenericSpecial.c
+  optimizeModel.c
+  parsePartition.c
+  queue.c
+  randomTree.c
+  recom.c
+  restartHashTable.c
+  searchAlgo.c
+  ssort.c
+  stack.c
+  topologies.c
+  trash.c
+  treeIO.c
+  utils.c)
+endif()
\ No newline at end of file
diff --git a/pll/alignment.c b/pll/alignment.c
new file mode 100644
index 0000000..d50f6db
--- /dev/null
+++ b/pll/alignment.c
@@ -0,0 +1,754 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file alignment.c
+ *
+ * @brief Collection of routines for reading alignments
+ *
+ * Auxiliary functions for storing alignments read from predefined file formats
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+/** @defgroup alignmentGroup Reading and parsing multiple sequence alignments
+    
+    This set of functions handles the reading and parsing of several file formats that describe multiple sequence alignments. They are also responsible for storing the alignment in an internal structure
+*/
+static pllAlignmentData * pllParsePHYLIP (const char * filename);
+static pllAlignmentData * pllParseFASTA (const char * filename);
+static int read_phylip_header (int * inp, int * sequenceCount, int * sequenceLength);
+static __inline int parsedOk (int * actLen, int sequenceCount, int sequenceLength);
+static int parse_phylip (pllAlignmentData * alignmentData, int input);
+static int getFastaAlignmentInfo (int * inp, int * seqCount, int * seqLen);
+static int parseFastaAlignment (pllAlignmentData * alignmentData, int input);
+
+#ifdef __PLL_DEBUG_PARSER
+static int
+printTokens (int input)
+{
+  pllLexToken token;
+
+  do
+   {
+     NEXT_TOKEN
+
+     /* begin of parser */
+     switch (token.tokenType)
+      {
+        case PLL_TOKEN_NUMBER:
+          printf ("PLL_TOKEN_NUMBER (%.*s, %d)\n", token.len, token.lexeme, token.len);
+          break;
+        case PLL_TOKEN_STRING:
+          printf ("PLL_TOKEN_STRING (%.*s, %d)\n", token.len, token.lexeme, token.len);
+          break;
+        case PLL_TOKEN_EOF:
+          printf ("PLL_TOKEN_EOF\n");
+          break;
+        case PLL_TOKEN_WHITESPACE:
+          printf ("PLL_TOKEN_WHITESPACE\n");
+          break;
+        case PLL_TOKEN_NEWLINE:
+          printf ("PLL_TOKEN_NEWLINE\n");
+          break;
+        case PLL_TOKEN_UNKNOWN:
+          printf ("PLL_TOKEN_UNKNOWN (%.*s, %d)\n", token.len, token.lexeme, token.len);
+          break;
+        default:
+          break;
+      }
+     /* end of parser */
+
+
+   }
+  while (token.tokenType != PLL_TOKEN_EOF && token.tokenType != PLL_TOKEN_UNKNOWN);
+
+  if (token.tokenType == PLL_TOKEN_UNKNOWN) return (0);
+
+  return (1);
+}
+#endif
+
+/** @ingroup alignmentGroup
+    @brief Initialize alignment structure fields
+
+    Allocates memory for the data structure that will hold the alignment and
+    initializes it. It requires the number of sequences \a sequenceCount and
+    the length of sequences \a sequenceLength. It returns a pointer to the
+    initialized data structure.
+
+    @param sequenceCount
+      Number of sequences in the alignment
+    
+    @param sequenceLength
+      Length of the sequences
+
+    @return
+      Initialized alignment data structure
+*/
+pllAlignmentData *
+pllInitAlignmentData (int sequenceCount, int sequenceLength)
+ {
+   int i;
+   pllAlignmentData * alignmentData;
+   //void * mem;
+   //TUNG
+   unsigned char *mem;
+
+   
+   /** TODO */
+   alignmentData               =  (pllAlignmentData *) rax_malloc (sizeof (pllAlignmentData));
+   alignmentData->sequenceData = (unsigned char **) rax_malloc ((sequenceCount + 1) * sizeof (unsigned char *));
+   //mem = (void *) rax_malloc (sizeof (unsigned char) * (sequenceLength + 1) * sequenceCount);
+   //TUNG
+   mem = (unsigned char *)rax_malloc(sizeof(unsigned char) * (sequenceLength + 1) * sequenceCount);
+   for (i = 1; i <= sequenceCount; ++i)
+    {
+      alignmentData->sequenceData[i]                 = (unsigned char *) (&mem[sizeof (unsigned char) * (i - 1) * (sequenceLength + 1)]);
+      alignmentData->sequenceData[i][sequenceLength] = 0;
+    }
+   alignmentData->sequenceData[0] = NULL;
+    
+   alignmentData->sequenceLabels = (char **) rax_calloc ((sequenceCount + 1), sizeof (char *));
+
+   alignmentData->sequenceCount  = sequenceCount;
+   alignmentData->sequenceLength = sequenceLength;
+   alignmentData->originalSeqLength = sequenceLength;
+
+   /** TODO: remove siteWeights from alignment */
+   alignmentData->siteWeights    = NULL;
+
+   return (alignmentData);
+ }
+
+/** @ingroup alignmentGroup
+    @brief Deallocates the memory associated with the alignment data structure
+    
+    Deallocates the memory associated with the alignment data structure \a alignmentData.
+
+    @param alignmentData
+      The alignment data structure
+*/
+void
+pllAlignmentDataDestroy (pllAlignmentData * alignmentData)
+{
+  int i;
+
+  for (i = 1; i <= alignmentData->sequenceCount; ++ i)
+   {
+     rax_free (alignmentData->sequenceLabels[i]);
+   }
+  rax_free (alignmentData->sequenceLabels);
+  rax_free (alignmentData->sequenceData[1]);
+  rax_free (alignmentData->sequenceData);
+  rax_free (alignmentData->siteWeights);
+  rax_free (alignmentData);
+}
+
+
+/** @ingroup alignmentGroup
+    @brief Prints the alignment to the console
+
+    @param alignmentData
+      The alignment data structure
+*/
+void 
+pllAlignmentDataDumpConsole (pllAlignmentData * alignmentData)
+ {
+   int i;
+
+   printf ("%d %d\n", alignmentData->sequenceCount, alignmentData->sequenceLength);
+   for (i = 1; i <= alignmentData->sequenceCount; ++ i)
+    {
+      printf ("%s %s\n", alignmentData->sequenceLabels[i], alignmentData->sequenceData[i]);
+    }
+ }
+
+
+
+static void dump_fasta_content(FILE * fp, pllAlignmentData * alignmentData)
+{
+  int i;
+
+  for (i = 1; i <= alignmentData->sequenceCount; ++i)
+     fprintf (fp, ">%s\n%s\n", alignmentData->sequenceLabels[i], alignmentData->sequenceData[i]);
+}
+
+static void dump_phylip_content(FILE * fp, pllAlignmentData * alignmentData)
+{
+  int i;
+
+  for (i = 1; i <= alignmentData->sequenceCount; ++i)
+     fprintf (fp, "%s %s\n", alignmentData->sequenceLabels[i], alignmentData->sequenceData[i]);
+}
+
+/** @ingroup alignmentGroup
+    @brief Dump the alignment to a file of format \a fileFormat
+
+    Dumps the alignment contained in \a alignmentData to file \a filename of type \a fileFormat.
+
+    @note If \a filename exists, all contents will be erased
+
+    @param alignmentData
+      Alignment data structure
+
+    @param fileFormat
+      Format of output file. Can take the value \b PLL_FORMAT_PHYLIP or \b PLL_FORMAT_FASTA
+
+    @param filename
+      Output filename
+
+    @return
+      Returns \b PLL_TRUE on success, otherwise \b PLL_FALSE.
+*/
+int
+pllAlignmentDataDumpFile (pllAlignmentData * alignmentData, int fileFormat, const char * filename)
+{
+  FILE * fp;
+  void (*outfun)(FILE *, pllAlignmentData *);
+  
+  if (fileFormat != PLL_FORMAT_PHYLIP && fileFormat != PLL_FORMAT_FASTA) return (PLL_FALSE);
+
+  outfun = (fileFormat == PLL_FORMAT_PHYLIP) ? dump_phylip_content : dump_fasta_content;
+
+  fp = fopen (filename,"wb");
+  if (!fp) return (PLL_FALSE);
+  
+  /* if PHYLIP print the silly header at the beginning */
+  if (fileFormat == PLL_FORMAT_PHYLIP)
+   {
+     fprintf (fp, "%d %d\n", alignmentData->sequenceCount, alignmentData->sequenceLength);
+   }
+  
+  outfun(fp, alignmentData);
+
+  fclose (fp);
+  return (PLL_TRUE);
+}
+
+
+
+/* ROUTINES FOR PHYLIP PARSING */
+/** @ingroup alignmentGroup
+    @brief Parse the PHYLIP file header
+*/
+static int
+read_phylip_header (int * inp, int * sequenceCount, int * sequenceLength)
+{
+  pllLexToken token;
+  int input;
+
+  input = *inp;
+
+
+  NEXT_TOKEN
+  CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+  if (token.tokenType != PLL_TOKEN_NUMBER) return (0);
+
+  *sequenceCount = atoi (token.lexeme);
+
+  NEXT_TOKEN
+  CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+  if (token.tokenType != PLL_TOKEN_NUMBER) return (0);
+
+  *sequenceLength = atoi (token.lexeme);
+
+  *inp = input;
+
+  return (*sequenceCount && *sequenceLength);
+}
+
+static __inline int
+parsedOk (int * actLen, int sequenceCount, int sequenceLength)
+{
+  int i;
+
+  for (i = 1; i <= sequenceCount; ++ i)
+   {
+     if (actLen[i] != sequenceLength) return (0);
+   }
+  
+  return (1);
+}
+
+
+/** @ingroup alignmentGroup
+    @brief Parse the PHYLIP file body
+*/
+static int
+parse_phylip (pllAlignmentData * alignmentData, int input)
+{
+  int i,j;
+  pllLexToken token;
+  int * sequenceLength;
+  int rc;
+
+  sequenceLength = (int *) rax_calloc (alignmentData->sequenceCount + 1, sizeof (int));
+
+  NEXT_TOKEN
+  for (i = 0; ; ++i)
+  {
+    j = i % alignmentData->sequenceCount;
+    if (i < alignmentData->sequenceCount) 
+     {
+       if (token.tokenType == PLL_TOKEN_EOF)
+        {
+          rc = parsedOk (sequenceLength, alignmentData->sequenceCount, alignmentData->sequenceLength);
+          rax_free (sequenceLength);
+          return (rc);
+        }
+
+       if (token.tokenType == PLL_TOKEN_UNKNOWN)
+        {
+          rax_free (sequenceLength);
+          return (0);
+        }
+
+       CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+
+       if (token.tokenType != PLL_TOKEN_STRING && token.tokenType != PLL_TOKEN_NUMBER && token.tokenType != PLL_TOKEN_FLOAT)
+        {
+          rax_free (sequenceLength);
+          return (0);
+        }
+       alignmentData->sequenceLabels[i + 1] = my_strndup (token.lexeme, token.len);
+       NEXT_TOKEN
+       CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+     }
+    
+    while (1)
+     {
+       if (token.tokenType == PLL_TOKEN_EOF)
+        {
+          rc = parsedOk (sequenceLength, alignmentData->sequenceCount, alignmentData->sequenceLength);
+          rax_free (sequenceLength);
+          return (rc);
+        }
+
+       if (token.tokenType == PLL_TOKEN_UNKNOWN)
+        {
+         rax_free (sequenceLength);
+         return (0);
+        }
+       
+       if (token.tokenType == PLL_TOKEN_NEWLINE) break;
+
+       if (token.tokenType != PLL_TOKEN_STRING)
+        {
+          rax_free (sequenceLength);
+          return (0);
+        }
+
+       if (sequenceLength[j + 1] + token.len > alignmentData->sequenceLength) 
+        {
+          fprintf (stderr, "Sequence %d is larger than specified\n", j + 1);
+          rax_free (sequenceLength);
+          return (0);
+        }
+       memmove (alignmentData->sequenceData[j + 1] + sequenceLength[j + 1], token.lexeme, token.len);
+       sequenceLength[j + 1] += token.len;
+
+       NEXT_TOKEN
+       CONSUME (PLL_TOKEN_WHITESPACE)
+     }
+    CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE);
+  }
+}
+
+/* Phylip parsers. Use the following attributed grammar 
+ * 
+ *        S -> HEADER ENDL DATA
+ *   HEADER -> PLL_TOKEN_NUMBER PLL_TOKEN_WHITESPACE PLL_TOKEN_NUMBER ENDL |
+ *             PLL_TOKEN_WHITESPACE PLL_TOKEN_NUMBER PLL_TOKEN_WHITESPACE PLL_TOKEN_NUMBER ENDL
+ *     ENDL -> PLL_TOKEN_WHITESPACE PLL_TOKEN_NEWLINE | PLL_TOKEN_NEWLINE
+ *     DATA -> PLL_TOKEN_STRING PLL_TOKEN_WHITESPACE PLL_TOKEN_STRING ENDL DATA |
+ *             PLL_TOKEN_WHITESPACE PLL_TOKEN_STRING PLL_TOKEN_WHITESPACE PLL_TOKEN_STRING ENDL DATA | 
+ *             PLL_TOKEN_STRING PLL_TOKEN_WHITESPACE PLL_TOKEN_STRING PLL_TOKEN_EOF |
+ *             PLL_TOKEN_WHITESPACE PLL_TOKEN_STRING PLL_TOKEN_WHITESPACE PLL_TOKEN_STRING PLL_TOKEN_EOF
+ */
+
+/** @ingroup alignmentGroup
+    @brief Parse a PHYLIP file
+
+    Parses the PHYLIP file \a filename and returns a ::pllAlignmentData structure
+    with the alignment.
+
+    @param filename
+      Name of file to be parsed
+
+    @return
+      Returns a structure of type ::pllAlignmentData that contains the alignment, or \b NULL
+      in case of failure.
+*/
+static pllAlignmentData *
+pllParsePHYLIP (const char * filename)
+{
+  int 
+    i, input, sequenceCount, sequenceLength;
+  char * rawdata;
+  long filesize;
+  pllAlignmentData * alignmentData;
+
+  rawdata = pllReadFile (filename, &filesize);
+  if (!rawdata)
+   {
+     errno = PLL_ERROR_FILE_OPEN;
+     return (NULL);
+   }
+  
+  init_lexan (rawdata, filesize);
+  input = get_next_symbol();
+
+  /* parse the header to obtain the number of taxa and sequence length */
+  if (!read_phylip_header (&input, &sequenceCount, &sequenceLength))
+   {
+     rax_free (rawdata);
+     fprintf (stderr, "Error while parsing PHYLIP header (number of taxa and sequence length)\n");
+     errno = PLL_ERROR_PHYLIP_HEADER_SYNTAX;
+     return (NULL);
+   }
+
+  lex_table_amend_phylip();
+
+  /* allocate alignment structure */
+  alignmentData = pllInitAlignmentData (sequenceCount, sequenceLength);
+
+  if (! parse_phylip (alignmentData, input))
+   {
+     errno = PLL_ERROR_PHYLIP_BODY_SYNTAX;
+     pllAlignmentDataDestroy (alignmentData);
+     lex_table_restore();
+     rax_free (rawdata);
+     return (NULL);
+   }
+  
+  lex_table_restore();
+  rax_free (rawdata);
+
+  alignmentData->siteWeights  = (int *) rax_malloc (alignmentData->sequenceLength * sizeof (int));
+  for (i = 0; i < alignmentData->sequenceLength; ++ i) 
+    alignmentData->siteWeights[i] = 1;
+
+  return (alignmentData);
+}
+
+pllAlignmentData *
+pllParsePHYLIPString (const char *rawdata, long filesize)
+{
+  int
+    i, input, sequenceCount, sequenceLength;
+//  char * rawdata;
+//  long filesize;
+  pllAlignmentData * alignmentData;
+
+//  rawdata = pllReadFile (filename, &filesize);
+//  if (!rawdata)
+//   {
+//     errno = PLL_ERROR_FILE_OPEN;
+//     return (NULL);
+//   }
+
+  init_lexan (rawdata, filesize);
+  input = get_next_symbol();
+
+  /* parse the header to obtain the number of taxa and sequence length */
+  if (!read_phylip_header (&input, &sequenceCount, &sequenceLength))
+   {
+//     rax_free (rawdata);
+     fprintf (stderr, "Error while parsing PHYLIP header (number of taxa and sequence length)\n");
+     errno = PLL_ERROR_PHYLIP_HEADER_SYNTAX;
+     return (NULL);
+   }
+
+  lex_table_amend_phylip();
+
+  /* allocate alignment structure */
+  alignmentData = pllInitAlignmentData (sequenceCount, sequenceLength);
+
+  if (! parse_phylip (alignmentData, input))
+   {
+     errno = PLL_ERROR_PHYLIP_BODY_SYNTAX;
+     pllAlignmentDataDestroy (alignmentData);
+     lex_table_restore();
+//     rax_free (rawdata);
+     return (NULL);
+   }
+
+  lex_table_restore();
+//  rax_free (rawdata);
+
+  alignmentData->siteWeights  = (int *) rax_malloc (alignmentData->sequenceLength * sizeof (int));
+  for (i = 0; i < alignmentData->sequenceLength; ++ i)
+    alignmentData->siteWeights[i] = 1;
+
+  return (alignmentData);
+}
+
+/* FASTA routines */
+/* only check whether it is a valid alignment in fasta format */
+/** @ingroup alignmentGroup
+    @brief Get information about the FASTA alignment
+
+    Gets information such as the number of sequences and the sequence length from a FASTA alignment
+
+    @return
+      Returns \b PLL_TRUE if the alignment is valid, otherwise \b PLL_FALSE
+*/
+static int
+getFastaAlignmentInfo (int * inp, int * seqCount, int * seqLen)
+{
+  pllLexToken token;
+  int input;
+
+  input = *inp;
+
+  *seqCount = *seqLen = 0;
+
+  NEXT_TOKEN
+  CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+  if (token.tokenType != PLL_TOKEN_NUMBER && token.tokenType != PLL_TOKEN_STRING) return (PLL_FALSE);
+
+  while (1)
+   {
+     switch (token.tokenType)
+      {
+        case PLL_TOKEN_EOF:
+          return (PLL_TRUE);
+
+        case PLL_TOKEN_NUMBER:
+        case PLL_TOKEN_STRING:
+          if (token.len < 2 || token.lexeme[0] != '>') return (0);
+          break;
+        default:
+          return (PLL_FALSE);
+      }
+     
+     NEXT_TOKEN
+     CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+     /* read second token (sequence) */
+     switch (token.tokenType)
+      {
+        case PLL_TOKEN_EOF:
+          return (PLL_FALSE);
+          break;
+
+        case PLL_TOKEN_NUMBER:
+        case PLL_TOKEN_STRING:
+          if (!*seqLen)
+            *seqLen = token.len;
+          else
+           {
+             if (*seqLen != token.len) return (0);
+           }
+          break;
+        default:
+          return (PLL_FALSE);
+      }
+     NEXT_TOKEN
+     CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+     ++ (*seqCount);
+   }
+
+  return (PLL_TRUE);
+}
+
+/** @ingroup alignmentGroup
+    @brief Check whether the FASTA content is valid
+*/
+static int
+parseFastaAlignment (pllAlignmentData * alignmentData, int input)
+{
+  pllLexToken token;
+  int i;
+
+  NEXT_TOKEN
+  CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+  if (token.tokenType != PLL_TOKEN_NUMBER && token.tokenType != PLL_TOKEN_STRING) return (0);
+
+  i = 1;
+  while (1)
+   {
+     /* first parse the sequence label */
+     switch (token.tokenType)
+      {
+        case PLL_TOKEN_EOF:
+          return (1);
+          break;
+
+        case PLL_TOKEN_NUMBER:
+        case PLL_TOKEN_STRING:
+          alignmentData->sequenceLabels[i] = my_strndup (token.lexeme + 1, token.len - 1);
+          break;
+        default:
+          return (0);
+      }
+     
+     NEXT_TOKEN
+     CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+     /* now parse the sequence itself */
+     switch (token.tokenType)
+      {
+        case PLL_TOKEN_EOF:
+          return (0);
+          break;
+
+        case PLL_TOKEN_NUMBER:
+        case PLL_TOKEN_STRING:
+          memmove (alignmentData->sequenceData[i], token.lexeme, token.len);
+          break;
+        default:
+          return (0);
+      }
+     NEXT_TOKEN
+     CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+     ++ i;
+   }
+}
+
+
+/** @ingroup alignmentGroup
+    @brief Parse a FASTA file
+    
+    Parses the FASTA file \a filename and returns a ::pllAlignmentData structure
+    with the alignment.
+
+    @param filename
+      Name of file to be parsed
+
+    @return
+      Returns a structure of type ::pllAlignmentData that contains the alignment, or \b NULL
+      in case of failure.
+*/
+static pllAlignmentData *
+pllParseFASTA (const char * filename)
+{
+  int
+    i,
+    seqLen,
+    seqCount,
+    input;
+  long filesize;
+
+  char * rawdata;
+  pllAlignmentData * alignmentData;
+
+  rawdata = pllReadFile (filename, &filesize);
+  if (!rawdata)
+   {
+     errno = PLL_ERROR_FILE_OPEN;
+     return (NULL);
+   }
+
+  lex_table_amend_fasta ();
+  
+  init_lexan (rawdata, filesize);
+  input = get_next_symbol ();
+
+
+  if (!getFastaAlignmentInfo (&input, &seqCount, &seqLen))
+   {
+     errno = PLL_ERROR_FASTA_SYNTAX;
+     lex_table_restore ();
+     rax_free (rawdata);
+     return (NULL);
+   }
+  
+  alignmentData = pllInitAlignmentData (seqCount, seqLen);
+  
+  printf ("\n---------------\n\n");
+
+  init_lexan (rawdata, filesize);
+  input = get_next_symbol ();
+
+  if (!parseFastaAlignment (alignmentData, input))
+   {
+     errno = PLL_ERROR_FASTA_SYNTAX;
+     pllAlignmentDataDestroy (alignmentData);
+     lex_table_restore();
+     rax_free(rawdata);
+     return (NULL);
+   }
+
+  /* allocate alignment structure */
+
+
+  lex_table_restore ();
+  rax_free (rawdata);
+
+  alignmentData->siteWeights = (int *) rax_malloc (alignmentData->sequenceLength * sizeof (int));
+  for (i = 0; i < alignmentData->sequenceLength; ++ i)
+    alignmentData->siteWeights[i] = 1;
+
+  return (alignmentData);
+}
+
+
+
+/** @ingroup alignmentGroup
+    @brief Parse a file that contains a multiple sequence alignment
+
+    Parses the file \a filename of type \a fileType which contains a multiple sequence alignment.
+    The supported file types are the sequential and interleaved versions of PHYLIP format, and
+    the FASTA format. The parsed alignment is returned as a pointer to a structure of type
+    ::pllAlignmentData
+
+    @param fileType
+      Type of file to parse. Can be either \b PLL_FORMAT_PHYLIP or \b PLL_FORMAT_FASTA
+
+    @param filename
+      Name of file to parse
+
+    @return
+      Returns a structure of type ::pllAlignmentData that contains the multiple sequence alignment,
+      otherwise returns \b NULL in case of failure.
+*/
+pllAlignmentData *
+pllParseAlignmentFile (int fileType, const char * filename)
+{
+
+  switch (fileType)
+   {
+     case PLL_FORMAT_PHYLIP:
+       return (pllParsePHYLIP (filename));
+     case PLL_FORMAT_FASTA:
+       return (pllParseFASTA (filename));
+     default:
+       /* RTFM */
+       errno = PLL_ERROR_INVALID_FILETYPE;
+       return (NULL);
+   }
+}
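
For orientation, the file above defines a small public parsing API (pllParseAlignmentFile, pllAlignmentDataDumpConsole, pllAlignmentDataDumpFile, pllAlignmentDataDestroy). Below is a minimal caller sketch, assuming these entry points are declared in pll.h (as the includes at the top of alignment.c suggest) and using placeholder file names; it is illustrative only and not part of the imported sources.

#include <errno.h>
#include <stdio.h>
#include "pll.h"   /* public PLL declarations, per the includes at the top of alignment.c */

int main (void)
{
  /* "test.phy" is a placeholder name; per the grammar comment in parse_phylip above,
     a PHYLIP input starts with a "<taxa> <sites>" header line followed by
     "<label> <sequence>" rows. */
  pllAlignmentData * aln = pllParseAlignmentFile (PLL_FORMAT_PHYLIP, "test.phy");

  if (!aln)
   {
     /* on failure the parsers return NULL and store a PLL_ERROR_* code in errno */
     fprintf (stderr, "parsing failed (error %d)\n", errno);
     return 1;
   }

  /* print taxon labels and sequences to the console ... */
  pllAlignmentDataDumpConsole (aln);

  /* ... and write the same alignment back out, here in FASTA format */
  pllAlignmentDataDumpFile (aln, PLL_FORMAT_FASTA, "test.fasta");

  pllAlignmentDataDestroy (aln);
  return 0;
}
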
diff --git a/pll/avxLikelihood.c b/pll/avxLikelihood.c
new file mode 100644
index 0000000..5202883
--- /dev/null
+++ b/pll/avxLikelihood.c
@@ -0,0 +1,4111 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file avxLikelihood.c
+ *
+ * @brief AVX versions of the likelihood functions
+ *
+ * AVX versions of the likelihood functions
+ */
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include <limits.h>
+#include <stdint.h>
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#include <immintrin.h>
+#include <assert.h>
+
+#ifdef _FMA
+#include <x86intrin.h>
+#define FMAMACC(a,b,c) _mm256_fmadd_pd(b,c,a)
+#endif
+
+#include "pll.h"
+#include "pllInternal.h"
+
+extern const unsigned int mask32[32];
+
+PLL_ALIGN_BEGIN const union PLL_ALIGN_END
+{
+  uint64_t i[4];
+  __m256d m;
+  
+} absMask_AVX = {{0x7fffffffffffffffULL, 0x7fffffffffffffffULL, 0x7fffffffffffffffULL, 0x7fffffffffffffffULL}};
+
+
+
+static __inline __m256d hadd4(__m256d v, __m256d u)
+{ 
+  __m256d
+    a, b;
+  
+  v = _mm256_hadd_pd(v, v);
+  a = _mm256_permute2f128_pd(v, v, 1);
+  v = _mm256_add_pd(a, v);
+
+  u = _mm256_hadd_pd(u, u);
+  b = _mm256_permute2f128_pd(u, u, 1);
+  u = _mm256_add_pd(b, u);
+
+  v = _mm256_mul_pd(v, u);	
+  
+  return v;
+}
+
+static __inline __m256d hadd3(__m256d v)
+{ 
+  __m256d
+    a;
+  
+  v = _mm256_hadd_pd(v, v);
+  a = _mm256_permute2f128_pd(v, v, 1);
+  v = _mm256_add_pd(a, v);
+  
+  return v;
+}
+
+
+void  newviewGTRGAMMA_AVX(int tipCase,
+			 double *x1, double *x2, double *x3,
+			 double *extEV, double *tipVector,
+			 int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+			 const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling
+			 )
+{
+ 
+  int  
+    i, 
+    k, 
+    scale, 
+    addScale = 0;
+ 
+  __m256d 
+    minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD),
+    twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+ 
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	double 
+	  *uX1, *uX2;
+	PLL_ALIGN_BEGIN double
+	  umpX1[1024] PLL_ALIGN_END,
+	  umpX2[1024] PLL_ALIGN_END;
+
+	for (i = 1; i < 16; i++)
+	  {
+	    __m256d 
+	      tv = _mm256_load_pd(&(tipVector[i * 4]));
+
+	    int 
+	      j;
+	    
+	    for (j = 0; j < 4; j++)
+	      for (k = 0; k < 4; k++)
+		{		 
+		  __m256d 
+		    left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+
+		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = hadd3(left1);
+		  		  		  
+		  _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		}
+	  
+	    for (j = 0; j < 4; j++)
+	      for (k = 0; k < 4; k++)
+		{		 
+		  __m256d 
+		    left1 = _mm256_load_pd(&right[j * 16 + k * 4]);		  		  		  
+
+		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = hadd3(left1);
+		  		  		  
+		  _mm256_store_pd(&umpX2[i * 64 + j * 16 + k * 4], left1);
+		}	    
+	  }   	
+	  
+
+	for(i = 0; i < n; i++)
+	  {	    		 	    
+	    uX1 = &umpX1[64 * tipX1[i]];
+	    uX2 = &umpX2[64 * tipX2[i]];		  
+	    
+	    for(k = 0; k < 4; k++)
+	      {
+		__m256d	   
+		  xv = _mm256_setzero_pd();
+	       
+		int 
+		  l;
+		
+		for(l = 0; l < 4; l++)
+		  {	       	     				      	      																	   
+		    __m256d
+		      x1v =  _mm256_mul_pd(_mm256_load_pd(&uX1[k * 16 + l * 4]), _mm256_load_pd(&uX2[k * 16 + l * 4]));
+		
+		    __m256d 
+		      evv = _mm256_load_pd(&extEV[l * 4]);
+#ifdef _FMA
+		    xv = FMAMACC(xv,x1v,evv);
+#else						  
+		    xv = _mm256_add_pd(xv, _mm256_mul_pd(x1v, evv));
+#endif
+		  }
+		
+		_mm256_store_pd(&x3[16 * i + 4 * k], xv);
+	      }	         	   	    
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+	double 
+	  *uX1;
+	PLL_ALIGN_BEGIN double
+	  umpX1[1024] PLL_ALIGN_END;
+
+	for (i = 1; i < 16; i++)
+	  {
+	    __m256d 
+	      tv = _mm256_load_pd(&(tipVector[i*4]));
+
+	    int 
+	      j;
+	    
+	    for (j = 0; j < 4; j++)
+	      for (k = 0; k < 4; k++)
+		{		 
+		  __m256d 
+		    left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+
+		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = hadd3(left1);
+		  		  		  
+		  _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		}	 	   
+	  }   	
+	
+	for(i = 0; i < n; i++)
+	  { 
+	    __m256d
+	      xv[4];	    	   
+	    
+	    scale = 1;
+	    uX1 = &umpX1[64 * tipX1[i]];
+
+	    for(k = 0; k < 4; k++)
+	      {
+		__m256d	   		 
+		  xvr = _mm256_load_pd(&(x2[i * 16 + k * 4]));
+
+		int 
+		  l;
+
+		xv[k]  = _mm256_setzero_pd();
+		  
+		for(l = 0; l < 4; l++)
+		  {	       	     				      	      															
+		    __m256d  
+		      x1v = _mm256_load_pd(&uX1[k * 16 + l * 4]),		     
+		      x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			
+		    x2v = hadd3(x2v);
+		    x1v = _mm256_mul_pd(x1v, x2v);			
+		
+		    __m256d 
+		      evv = _mm256_load_pd(&extEV[l * 4]);
+			
+#ifdef _FMA
+		    xv[k] = FMAMACC(xv[k],x1v,evv);
+#else			  
+		    xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+#endif
+		  }
+		    
+		if(scale)
+		  {
+		    __m256d 	     
+		      v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+
+		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    
+		    if(_mm256_movemask_pd( v1 ) != 15)
+		      scale = 0;
+		  }
+	      }	    
+
+	    if(scale)
+	      {
+		xv[0] = _mm256_mul_pd(xv[0], twoto);
+		xv[1] = _mm256_mul_pd(xv[1], twoto);
+		xv[2] = _mm256_mul_pd(xv[2], twoto);
+		xv[3] = _mm256_mul_pd(xv[3], twoto);
+
+		if(useFastScaling)
+		  addScale += wgt[i];
+		else
+		  ex3[i] += 1;
+	      }
+
+	    _mm256_store_pd(&x3[16 * i],      xv[0]);
+	    _mm256_store_pd(&x3[16 * i + 4],  xv[1]);
+	    _mm256_store_pd(&x3[16 * i + 8],  xv[2]);
+	    _mm256_store_pd(&x3[16 * i + 12], xv[3]);
+	  }
+      }
+      break;
+    case PLL_INNER_INNER:
+      {
+	for(i = 0; i < n; i++)
+	  {	
+	    __m256d
+	      xv[4];
+	    
+	    scale = 1;
+
+	    for(k = 0; k < 4; k++)
+	      {
+		__m256d	   
+		 
+		  xvl = _mm256_load_pd(&(x1[i * 16 + k * 4])),
+		  xvr = _mm256_load_pd(&(x2[i * 16 + k * 4]));
+
+		int 
+		  l;
+
+		xv[k] = _mm256_setzero_pd();
+
+		for(l = 0; l < 4; l++)
+		  {	       	     				      	      															
+		    __m256d 
+		      x1v = _mm256_mul_pd(xvl, _mm256_load_pd(&left[k * 16 + l * 4])),
+		      x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			
+		    x1v = hadd4(x1v, x2v);			
+		
+		    __m256d 
+		      evv = _mm256_load_pd(&extEV[l * 4]);
+						  
+		    xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+		  }
+		
+		if(scale)
+		  {
+		    __m256d 	     
+		      v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+
+		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    
+		    if(_mm256_movemask_pd( v1 ) != 15)
+		      scale = 0;
+		  }
+	      }
+
+	     if(scale)
+	      {
+		xv[0] = _mm256_mul_pd(xv[0], twoto);
+		xv[1] = _mm256_mul_pd(xv[1], twoto);
+		xv[2] = _mm256_mul_pd(xv[2], twoto);
+		xv[3] = _mm256_mul_pd(xv[3], twoto);
+
+		if(useFastScaling)
+		  addScale += wgt[i];
+		else
+		  ex3[i] += 1;		
+	      }
+		
+	    _mm256_store_pd(&x3[16 * i],      xv[0]);
+	    _mm256_store_pd(&x3[16 * i + 4],  xv[1]);
+	    _mm256_store_pd(&x3[16 * i + 8],  xv[2]);
+	    _mm256_store_pd(&x3[16 * i + 12], xv[3]);
+	  }
+      }
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+  
+}
+
+void  newviewGTRGAMMA_AVX_GAPPED_SAVE(int tipCase,
+				      double *x1_start, double *x2_start, double *x3_start,
+				      double *extEV, double *tipVector,
+				      int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+				      const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+				      unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
+				      double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
+				      )
+{
+ 
+  int  
+    i, 
+    k, 
+    scale,
+    scaleGap,
+    addScale = 0;
+ 
+  __m256d 
+    minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD ),
+    twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+ 
+  double
+    *x1,
+    *x2,
+    *x3,
+    *x1_ptr = x1_start,
+    *x2_ptr = x2_start;
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	double 
+	  *uX1, *uX2;
+	PLL_ALIGN_BEGIN double
+	  umpX1[1024] PLL_ALIGN_END,
+	  umpX2[1024] PLL_ALIGN_END;
+
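+	/* Pre-compute the look-up tables umpX1/umpX2: for each of the 15
+	   possible tip states (indices 1..15, including ambiguity codes) the
+	   tip vector is multiplied with every 4-double block of the left/right
+	   P matrices (4 rate categories x 4 states), so that the per-site loop
+	   below reduces to table look-ups. */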
+	for (i = 1; i < 16; i++)
+	  {
+	    __m256d 
+	      tv = _mm256_load_pd(&(tipVector[i * 4]));
+
+	    int 
+	      j;
+	    
+	    for (j = 0; j < 4; j++)
+	      for (k = 0; k < 4; k++)
+		{		 
+		  __m256d 
+		    left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+
+		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = hadd3(left1);
+		  		  		  
+		  _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		}
+	  
+	    for (j = 0; j < 4; j++)
+	      for (k = 0; k < 4; k++)
+		{		 
+		  __m256d 
+		    left1 = _mm256_load_pd(&right[j * 16 + k * 4]);		  		  		  
+
+		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = hadd3(left1);
+		  		  		  
+		  _mm256_store_pd(&umpX2[i * 64 + j * 16 + k * 4], left1);
+		}	    
+	  }   	
+	  
+	x3 = x3_gapColumn;
+
+	{
+	  uX1 = &umpX1[960];
+	  uX2 = &umpX2[960];		  
+	  
+	  for(k = 0; k < 4; k++)
+	    {
+	      __m256d	   
+		xv = _mm256_setzero_pd();
+	      
+	      int 
+		l;
+	      
+	      for(l = 0; l < 4; l++)
+		{	       	     				      	      																	   
+		  __m256d
+		    x1v =  _mm256_mul_pd(_mm256_load_pd(&uX1[k * 16 + l * 4]), _mm256_load_pd(&uX2[k * 16 + l * 4]));
+		  
+		  __m256d 
+		    evv = _mm256_load_pd(&extEV[l * 4]);
+#ifdef _FMA
+		  xv = FMAMACC(xv,x1v,evv);
+#else						  
+		  xv = _mm256_add_pd(xv, _mm256_mul_pd(x1v, evv));
+#endif
+		}
+		    
+	      _mm256_store_pd(&x3[4 * k], xv);
+	    }
+	}
+	
+	x3 = x3_start;
+
+	for(i = 0; i < n; i++)
+	  {		    	    	
+	    if(!(x3_gap[i / 32] & mask32[i % 32]))	     
+	      {
+		uX1 = &umpX1[64 * tipX1[i]];
+		uX2 = &umpX2[64 * tipX2[i]];		  
+	    
+		for(k = 0; k < 4; k++)
+		  {
+		    __m256d	   
+		      xv = _mm256_setzero_pd();
+	       
+		    int 
+		      l;
+		
+		    for(l = 0; l < 4; l++)
+		      {	       	     				      	      																	   
+			__m256d
+			  x1v =  _mm256_mul_pd(_mm256_load_pd(&uX1[k * 16 + l * 4]), _mm256_load_pd(&uX2[k * 16 + l * 4]));
+			
+			__m256d 
+			  evv = _mm256_load_pd(&extEV[l * 4]);
+#ifdef _FMA
+			xv = FMAMACC(xv,x1v,evv);
+#else						  
+			xv = _mm256_add_pd(xv, _mm256_mul_pd(x1v, evv));
+#endif
+		      }
+		    
+		    _mm256_store_pd(&x3[4 * k], xv);
+		  }
+
+		x3 += 16;
+	      }
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+	double 
+	  *uX1;
+	PLL_ALIGN_BEGIN double
+	  umpX1[1024] PLL_ALIGN_END;
+       
+	for (i = 1; i < 16; i++)
+	  {
+	    __m256d 
+	      tv = _mm256_load_pd(&(tipVector[i*4]));
+
+	    int 
+	      j;
+	    
+	    for (j = 0; j < 4; j++)
+	      for (k = 0; k < 4; k++)
+		{		 
+		  __m256d 
+		    left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+
+		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = hadd3(left1);
+		  		  		  
+		  _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		}	 	   
+	  }	
+
+	{ 
+	  __m256d
+	    xv[4];
+	  
+	  scaleGap = 1;
+	  uX1 = &umpX1[960];
+
+	  x2 = x2_gapColumn;			 
+	  x3 = x3_gapColumn;
+
+	  for(k = 0; k < 4; k++)
+	    {
+	      __m256d	   		 
+		xvr = _mm256_load_pd(&(x2[k * 4]));
+
+	      int 
+		l;
+
+	      xv[k]  = _mm256_setzero_pd();
+		  
+	      for(l = 0; l < 4; l++)
+		{	       	     				      	      															
+		  __m256d  
+		    x1v = _mm256_load_pd(&uX1[k * 16 + l * 4]),		     
+		    x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			
+		  x2v = hadd3(x2v);
+		  x1v = _mm256_mul_pd(x1v, x2v);			
+		
+		  __m256d 
+		    evv = _mm256_load_pd(&extEV[l * 4]);
+			
+#ifdef _FMA
+		  xv[k] = FMAMACC(xv[k],x1v,evv);
+#else			  
+		  xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+#endif
+		}
+		    
+	      if(scaleGap)
+		{
+		  __m256d 	     
+		    v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+		  
+		  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    
+		  if(_mm256_movemask_pd( v1 ) != 15)
+		    scaleGap = 0;
+		}
+	    }
+	
+	  if(scaleGap)
+	    {
+	      xv[0] = _mm256_mul_pd(xv[0], twoto);
+	      xv[1] = _mm256_mul_pd(xv[1], twoto);
+	      xv[2] = _mm256_mul_pd(xv[2], twoto);
+	      xv[3] = _mm256_mul_pd(xv[3], twoto);	    
+	    }
+
+	  _mm256_store_pd(&x3[0],  xv[0]);
+	  _mm256_store_pd(&x3[4],  xv[1]);
+	  _mm256_store_pd(&x3[8],  xv[2]);
+	  _mm256_store_pd(&x3[12], xv[3]);
+	}
+	
+	x3 = x3_start;
+	
+	for(i = 0; i < n; i++)
+	  {
+	    if((x3_gap[i / 32] & mask32[i % 32]))
+	      {
+		if(scaleGap)
+		  {
+		    if(useFastScaling)
+		      addScale += wgt[i];
+		    else
+		      ex3[i]  += 1;
+		  }
+	      }
+	    else
+	      {
+		if(x2_gap[i / 32] & mask32[i % 32])
+		  x2 = x2_gapColumn;
+		else
+		  {
+		    x2 = x2_ptr;
+		    x2_ptr += 16;
+		  }
+		
+		__m256d
+		  xv[4];	    	   
+		
+		scale = 1;
+		uX1 = &umpX1[64 * tipX1[i]];
+		
+		for(k = 0; k < 4; k++)
+		  {
+		    __m256d	   		 
+		      xvr = _mm256_load_pd(&(x2[k * 4]));
+		    
+		    int 
+		      l;
+		    
+		    xv[k]  = _mm256_setzero_pd();
+		    
+		    for(l = 0; l < 4; l++)
+		      {	       	     				      	      															
+			__m256d  
+			  x1v = _mm256_load_pd(&uX1[k * 16 + l * 4]),		     
+			  x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			
+			x2v = hadd3(x2v);
+			x1v = _mm256_mul_pd(x1v, x2v);			
+			
+			__m256d 
+			  evv = _mm256_load_pd(&extEV[l * 4]);
+			
+#ifdef _FMA
+			xv[k] = FMAMACC(xv[k],x1v,evv);
+#else			  
+			xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+#endif
+		      }
+		    
+		    if(scale)
+		      {
+			__m256d 	     
+			  v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+			
+			v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+			
+			if(_mm256_movemask_pd( v1 ) != 15)
+			  scale = 0;
+		      }
+		  }	    
+	      
+		if(scale)
+		  {
+		    xv[0] = _mm256_mul_pd(xv[0], twoto);
+		    xv[1] = _mm256_mul_pd(xv[1], twoto);
+		    xv[2] = _mm256_mul_pd(xv[2], twoto);
+		    xv[3] = _mm256_mul_pd(xv[3], twoto);
+
+		    if(useFastScaling)
+		      addScale += wgt[i];
+		    else
+		      ex3[i] += 1;		   
+		  }
+	      
+		_mm256_store_pd(&x3[0],  xv[0]);
+		_mm256_store_pd(&x3[4],  xv[1]);
+		_mm256_store_pd(&x3[8],  xv[2]);
+		_mm256_store_pd(&x3[12], xv[3]);
+	      
+		x3 += 16;
+	      }
+	  }
+      }
+      break;
+    case PLL_INNER_INNER:
+      {          
+	{		
+	  x1 = x1_gapColumn;	     	    
+	  x2 = x2_gapColumn;	    
+	  x3 = x3_gapColumn;
+
+	  __m256d
+	    xv[4];
+	    
+	  scaleGap = 1;
+
+	  for(k = 0; k < 4; k++)
+	    {
+	      __m256d	   
+		
+		xvl = _mm256_load_pd(&(x1[k * 4])),
+		xvr = _mm256_load_pd(&(x2[k * 4]));
+
+	      int 
+		l;
+
+	      xv[k] = _mm256_setzero_pd();
+
+	      for(l = 0; l < 4; l++)
+		{	       	     				      	      															
+		  __m256d 
+		    x1v = _mm256_mul_pd(xvl, _mm256_load_pd(&left[k * 16 + l * 4])),
+		    x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+		  
+		  x1v = hadd4(x1v, x2v);			
+		  
+		  __m256d 
+		    evv = _mm256_load_pd(&extEV[l * 4]);
+		  
+		  xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+		}
+		
+	      if(scaleGap)
+		  {
+		    __m256d 	     
+		      v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+
+		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    
+		    if(_mm256_movemask_pd( v1 ) != 15)
+		      scaleGap = 0;
+		  }
+	    }
+
+	  if(scaleGap)
+	    {
+	      xv[0] = _mm256_mul_pd(xv[0], twoto);
+	      xv[1] = _mm256_mul_pd(xv[1], twoto);
+	      xv[2] = _mm256_mul_pd(xv[2], twoto);
+	      xv[3] = _mm256_mul_pd(xv[3], twoto);	       
+	    }
+		
+	  _mm256_store_pd(&x3[0],  xv[0]);
+	  _mm256_store_pd(&x3[4],  xv[1]);
+	  _mm256_store_pd(&x3[8],  xv[2]);
+	  _mm256_store_pd(&x3[12], xv[3]);
+	}	  
+      
+	x3 = x3_start;
+
+	for(i = 0; i < n; i++)
+	  {
+	    if(x3_gap[i / 32] & mask32[i % 32])
+	      {	     
+		if(scaleGap)
+		  {
+		    if(useFastScaling)
+		      addScale += wgt[i];
+		    else
+		      ex3[i]  += 1; 	       
+		  }
+	      }
+	    else
+	      {	
+		if(x1_gap[i / 32] & mask32[i % 32])
+		  x1 = x1_gapColumn;
+		else
+		  {
+		    x1 = x1_ptr;
+		    x1_ptr += 16;
+		  }
+	     
+		if(x2_gap[i / 32] & mask32[i % 32])
+		  x2 = x2_gapColumn;
+		else
+		  {
+		    x2 = x2_ptr;
+		    x2_ptr += 16;
+		  }
+
+		__m256d
+		  xv[4];
+	    
+		scale = 1;
+
+		for(k = 0; k < 4; k++)
+		  {
+		    __m256d	   
+		      
+		      xvl = _mm256_load_pd(&(x1[k * 4])),
+		      xvr = _mm256_load_pd(&(x2[k * 4]));
+		    
+		    int 
+		      l;
+		    
+		    xv[k] = _mm256_setzero_pd();
+		    
+		    for(l = 0; l < 4; l++)
+		      {	       	     				      	      															
+			__m256d 
+			  x1v = _mm256_mul_pd(xvl, _mm256_load_pd(&left[k * 16 + l * 4])),
+			  x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			
+			x1v = hadd4(x1v, x2v);			
+			
+			__m256d 
+			  evv = _mm256_load_pd(&extEV[l * 4]);
+			
+			xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+		      }
+		    
+		    if(scale)
+		      {
+			__m256d 	     
+			  v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+			
+			v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+			
+			if(_mm256_movemask_pd( v1 ) != 15)
+			  scale = 0;
+		      }
+		  }
+
+		if(scale)
+		  {
+		    xv[0] = _mm256_mul_pd(xv[0], twoto);
+		    xv[1] = _mm256_mul_pd(xv[1], twoto);
+		    xv[2] = _mm256_mul_pd(xv[2], twoto);
+		    xv[3] = _mm256_mul_pd(xv[3], twoto);
+		    
+		    if(useFastScaling)
+		      addScale += wgt[i];
+		    else
+		      ex3[i] += 1;
+		  }
+		
+		_mm256_store_pd(&x3[0],  xv[0]);
+		_mm256_store_pd(&x3[4],  xv[1]);
+		_mm256_store_pd(&x3[8],  xv[2]);
+		_mm256_store_pd(&x3[12], xv[3]);
+	      
+		x3 += 16;
+	      }
+	  }
+      }
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+  
+}
+
+
+
+
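+/* AVX kernel for DNA under GTR with per-site rate categories (CAT): each site
+   holds just 4 doubles and cptr[i] selects the left/right P matrices of the
+   site's rate category.  Underflow scaling (PLL_MINLIKELIHOOD /
+   PLL_TWOTOTHE256) is applied in the tip-inner and inner-inner cases only. */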
+void newviewGTRCAT_AVX(int tipCase,  double *EV,  int *cptr,
+			   double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+			   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+			   int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling)
+{
+  double
+    *le,
+    *ri,
+    *x1,
+    *x2;
+    
+  int 
+    i, 
+    addScale = 0;
+   
+  __m256d 
+    minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD ),
+    twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+  
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:      
+      for (i = 0; i < n; i++)
+	{	 
+	  int 
+	    l;
+	  
+	  le = &left[cptr[i] * 16];
+	  ri = &right[cptr[i] * 16];
+
+	  x1 = &(tipVector[4 * tipX1[i]]);
+	  x2 = &(tipVector[4 * tipX2[i]]);
+	  
+	  __m256d	   
+	    vv = _mm256_setzero_pd();
+	   	   	    
+	  for(l = 0; l < 4; l++)
+	    {	       	     				      	      															
+	      __m256d 
+		x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+		x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+			
+	      x1v = hadd4(x1v, x2v);			
+		
+	      __m256d 
+		evv = _mm256_load_pd(&EV[l * 4]);
+#ifdef _FMA
+	      vv = FMAMACC(vv,x1v,evv);
+#else				
+	      vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+#endif
+	    }	  		  
+
+	  _mm256_store_pd(&x3_start[4 * i], vv);	    	   	    
+	}
+      break;
+    case PLL_TIP_INNER:      
+      for (i = 0; i < n; i++)
+	{
+	  int 
+	    l;
+
+	  x1 = &(tipVector[4 * tipX1[i]]);
+	  x2 = &x2_start[4 * i];	 
+	  
+	  le =  &left[cptr[i] * 16];
+	  ri =  &right[cptr[i] * 16];
+
+	  __m256d	   
+	    vv = _mm256_setzero_pd();
+	  
+	  for(l = 0; l < 4; l++)
+	    {	       	     				      	      															
+	      __m256d 
+		x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+		x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+			
+	      x1v = hadd4(x1v, x2v);			
+		
+	      __m256d 
+		evv = _mm256_load_pd(&EV[l * 4]);
+				
+#ifdef _FMA
+	      vv = FMAMACC(vv,x1v,evv);
+#else	      
+	      vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));
+#endif
+	    }	  		  
+	  
+	  
+	  __m256d 	     
+	    v1 = _mm256_and_pd(vv, absMask_AVX.m);
+
+	  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	    
+	  if(_mm256_movemask_pd( v1 ) == 15)
+	    {	     	      
+	      vv = _mm256_mul_pd(vv, twoto);	      
+	      
+	      if(useFastScaling)
+		addScale += wgt[i];
+	      else
+		ex3[i] += 1;	      	     
+	    }       
+	  
+	  _mm256_store_pd(&x3_start[4 * i], vv);	 	  	  
+	}
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+	{
+	  int 
+	    l;
+
+	  x1 = &x1_start[4 * i];
+	  x2 = &x2_start[4 * i];
+	  
+	  
+	  le =  &left[cptr[i] * 16];
+	  ri =  &right[cptr[i] * 16];
+
+	  __m256d	   
+	    vv = _mm256_setzero_pd();
+	  
+	  for(l = 0; l < 4; l++)
+	    {	       	     				      	      															
+	      __m256d 
+		x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+		x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+			
+	      x1v = hadd4(x1v, x2v);			
+		
+	      __m256d 
+		evv = _mm256_load_pd(&EV[l * 4]);
+#ifdef _FMA
+	      vv = FMAMACC(vv,x1v,evv);
+#else						
+	      vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+#endif
+	    }	  		  
+
+	 
+	  __m256d 	     
+	    v1 = _mm256_and_pd(vv, absMask_AVX.m);
+
+	  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	    
+	  if(_mm256_movemask_pd( v1 ) == 15)
+	    {	
+	      vv = _mm256_mul_pd(vv, twoto);
+	      
+	      if(useFastScaling)
+		addScale += wgt[i];
+	      else
+		ex3[i] += 1;	   
+	    }	
+
+	  _mm256_store_pd(&x3_start[4 * i], vv);
+	  	  
+	}
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
+
+
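+/* Gap-saving variant of newviewGTRCAT_AVX: sites consisting only of gaps are
+   not stored individually; they share the pre-computed x*_gapColumn entries,
+   and the x*_gap bit vectors (queried via isGap()/noGap()) steer which
+   representation is used for each site. */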
+void newviewGTRCAT_AVX_GAPPED_SAVE(int tipCase,  double *EV,  int *cptr,
+				   double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+				   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+				   int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+				   unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+				   double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
+{
+  double
+    *le,
+    *ri,
+    *x1,
+    *x2, 
+    *x3,
+    *x1_ptr = x1_start,
+    *x2_ptr = x2_start, 
+    *x3_ptr = x3_start;
+  
+  int 
+    i, 
+    scaleGap = 0,
+    addScale = 0;
+   
+  __m256d 
+    minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD ),
+    twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+  
+
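+  /* Compute the shared all-gap column once, using the left/right P matrices
+     stored at rate-category index maxCats; scaleGap records whether this
+     column had to be rescaled (never in the tip-tip case). */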
+  {
+    int 
+      l;
+
+    x1 = x1_gapColumn;	      
+    x2 = x2_gapColumn;
+    x3 = x3_gapColumn;    	 
+	  	  
+    le =  &left[maxCats * 16];
+    ri =  &right[maxCats * 16];
+
+    __m256d	   
+      vv = _mm256_setzero_pd();
+	  
+    for(l = 0; l < 4; l++)
+      {	       	     				      	      															
+	__m256d 
+	  x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+	  x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+	
+	x1v = hadd4(x1v, x2v);			
+	
+	__m256d 
+	  evv = _mm256_load_pd(&EV[l * 4]);
+#ifdef _FMA
+	vv = FMAMACC(vv,x1v,evv);
+#else						
+	vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+#endif
+      }	  		  
+
+    if(tipCase != PLL_TIP_TIP)
+      {
+	__m256d 	     
+	  v1 = _mm256_and_pd(vv, absMask_AVX.m);
+    
+	v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+    
+	if(_mm256_movemask_pd( v1 ) == 15)
+	  {
+	    vv = _mm256_mul_pd(vv, twoto);	      	 
+	    scaleGap = 1;
+	  }
+      }
+    
+    _mm256_store_pd(x3, vv);    
+  }
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:      
+      for (i = 0; i < n; i++)
+	{ 
+	  if(noGap(x3_gap, i))
+	    {	 
+	      int 
+		l;
+	      
+	      x1 = &(tipVector[4 * tipX1[i]]);
+	      x2 = &(tipVector[4 * tipX2[i]]);
+
+	      x3 = x3_ptr;
+
+	      if(isGap(x1_gap, i))
+		le =  &left[maxCats * 16];
+	      else	  	  
+		le =  &left[cptr[i] * 16];	  
+	      
+	      if(isGap(x2_gap, i))
+		ri =  &right[maxCats * 16];
+	      else	 	  
+		ri =  &right[cptr[i] * 16];
+	  	  
+	      __m256d	   
+		vv = _mm256_setzero_pd();
+	      
+	      for(l = 0; l < 4; l++)
+		{	       	     				      	      															
+		  __m256d 
+		    x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+		    x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+		  
+		  x1v = hadd4(x1v, x2v);			
+		  
+		  __m256d 
+		    evv = _mm256_load_pd(&EV[l * 4]);
+#ifdef _FMA
+		  vv = FMAMACC(vv,x1v,evv);
+#else				
+		  vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+#endif
+		}	  		  
+
+	      _mm256_store_pd(x3, vv);	 
+	      
+	      x3_ptr += 4;
+	    }
+	}
+      break;
+    case PLL_TIP_INNER:      
+      for (i = 0; i < n; i++)
+	{ 
+	  if(isGap(x3_gap, i))
+	    {
+	      if(scaleGap)
+		{
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i] += 1;		   		    
+		}	       
+	    }
+	  else
+	    {
+	      int 
+		l;
+
+	      x1 = &(tipVector[4 * tipX1[i]]);    
+	      x3 = x3_ptr;
+
+	      if(isGap(x1_gap, i))
+		le =  &left[maxCats * 16];
+	      else
+		le =  &left[cptr[i] * 16];
+	  
+	      if(isGap(x2_gap, i))
+		{		 
+		  ri =  &right[maxCats * 16];
+		  x2 = x2_gapColumn;
+		}
+	      else
+		{
+		  ri =  &right[cptr[i] * 16];
+		  x2 = x2_ptr;
+		  x2_ptr += 4;
+		}	  	 
+
+	      __m256d	   
+		vv = _mm256_setzero_pd();
+	      
+	      for(l = 0; l < 4; l++)
+		{	       	     				      	      															
+		  __m256d 
+		    x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+		    x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+		  
+		  x1v = hadd4(x1v, x2v);			
+		  
+		  __m256d 
+		    evv = _mm256_load_pd(&EV[l * 4]);
+		  
+#ifdef _FMA
+		  vv = FMAMACC(vv,x1v,evv);
+#else	      
+		  vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));
+#endif
+		}	  		  
+	  
+	  
+	      __m256d 	     
+		v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	      
+	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      
+	      if(_mm256_movemask_pd( v1 ) == 15)
+		{	     	      
+		  vv = _mm256_mul_pd(vv, twoto);	      
+		  
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i] += 1;		 
+		}       
+	  
+	      _mm256_store_pd(x3, vv);	 	  	  
+
+	      x3_ptr += 4;
+	    }
+	}
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+	{
+	  if(isGap(x3_gap, i))
+	    {
+	      if(scaleGap)		   		    
+		{
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i] += 1;
+		}	      
+	    }
+	  else
+	    {
+	      int 
+		l;
+	      
+	      x3 = x3_ptr;
+	      
+	      if(isGap(x1_gap, i))
+		{
+		  x1 = x1_gapColumn;
+		  le =  &left[maxCats * 16];
+		}
+	      else
+		{
+		  le =  &left[cptr[i] * 16];
+		  x1 = x1_ptr;
+		  x1_ptr += 4;
+		}
+
+	      if(isGap(x2_gap, i))	
+		{
+		  x2 = x2_gapColumn;
+		  ri =  &right[maxCats * 16];	    
+		}
+	      else
+		{
+		  ri =  &right[cptr[i] * 16];
+		  x2 = x2_ptr;
+		  x2_ptr += 4;
+		}	 	  	  	  
+	  
+	      __m256d	   
+		vv = _mm256_setzero_pd();
+	      
+	      for(l = 0; l < 4; l++)
+		{	       	     				      	      															
+		  __m256d 
+		    x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+		    x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+		  
+		  x1v = hadd4(x1v, x2v);			
+		  
+		  __m256d 
+		    evv = _mm256_load_pd(&EV[l * 4]);
+#ifdef _FMA
+		  vv = FMAMACC(vv,x1v,evv);
+#else						
+		  vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+#endif
+		}	  		  
+	      
+	      
+	      __m256d 	     
+		v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	      
+	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      
+	      if(_mm256_movemask_pd( v1 ) == 15)
+		{	
+		  vv = _mm256_mul_pd(vv, twoto);	      
+		  
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i] += 1;		
+		}	
+	      
+	      _mm256_store_pd(x3, vv);
+	      
+	      x3_ptr += 4;
+	    }	  	  
+	}
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
+
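+/* AVX kernel for protein data (20 states) under the CAT model: per site the
+   20x20 left/right P matrices selected by cptr[i] are applied to the child
+   vectors, the result is pushed through the eigenvector matrix extEV, and the
+   20 resulting doubles are rescaled by PLL_TWOTOTHE256 when they all fall
+   below PLL_MINLIKELIHOOD (tip-inner and inner-inner cases). */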
+void newviewGTRCATPROT_AVX(int tipCase, double *extEV,
+			       int *cptr,
+			       double *x1, double *x2, double *x3, double *tipVector,
+			       int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+			       int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling)
+{
+  double
+    *le, *ri, *v, *vl, *vr;
+
+  int i, l, scale, addScale = 0;
+
+#ifdef _FMA
+  int k;
+#endif
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	for (i = 0; i < n; i++)
+	  {	   
+	    le = &left[cptr[i] * 400];
+	    ri = &right[cptr[i] * 400];
+
+	    vl = &(tipVector[20 * tipX1[i]]);
+	    vr = &(tipVector[20 * tipX2[i]]);
+	    v  = &x3[20 * i];	    	    	   	    
+
+	    __m256d vv[5];
+	    
+	    vv[0] = _mm256_setzero_pd();
+	    vv[1] = _mm256_setzero_pd();
+	    vv[2] = _mm256_setzero_pd();
+	    vv[3] = _mm256_setzero_pd();
+	    vv[4] = _mm256_setzero_pd();	   	    
+
+	    for(l = 0; l < 20; l++)
+	      {	       
+		__m256d 
+		  x1v = _mm256_setzero_pd(),
+		  x2v = _mm256_setzero_pd();	
+				
+		double 
+		  *ev = &extEV[l * 20],
+		  *lv = &le[l * 20],
+		  *rv = &ri[l * 20];														
+
+#ifdef _FMA		
+		for(k = 0; k < 20; k += 4) 
+		  {
+		    __m256d vlv = _mm256_load_pd(&vl[k]);
+		    __m256d lvv = _mm256_load_pd(&lv[k]);
+		    x1v = FMAMACC(x1v,vlv,lvv);
+		    __m256d vrv = _mm256_load_pd(&vr[k]);
+		    __m256d rvv = _mm256_load_pd(&rv[k]);
+		    x2v = FMAMACC(x2v,vrv,rvv);
+		  }
+#else		
+		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+
+		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));	
+#endif
+
+		x1v = hadd4(x1v, x2v);			
+#ifdef _FMA
+		for(k = 0; k < 5; k++) 
+		  {
+		    __m256d evv = _mm256_load_pd(&ev[k*4]);
+		    vv[k] = FMAMACC(vv[k],x1v,evv);
+		  }	  
+#else		
+		__m256d 
+		  evv[5];
+	    	
+		evv[0] = _mm256_load_pd(&ev[0]);
+		evv[1] = _mm256_load_pd(&ev[4]);
+		evv[2] = _mm256_load_pd(&ev[8]);
+		evv[3] = _mm256_load_pd(&ev[12]);
+		evv[4] = _mm256_load_pd(&ev[16]);		
+		
+		vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+		vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+		vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+		vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+		vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      		      	  
+#endif
+	      }
+	    _mm256_store_pd(&v[0], vv[0]);
+	    _mm256_store_pd(&v[4], vv[1]);
+	    _mm256_store_pd(&v[8], vv[2]);
+	    _mm256_store_pd(&v[12], vv[3]);
+	    _mm256_store_pd(&v[16], vv[4]);
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:      	
+      for (i = 0; i < n; i++)
+	{
+	  le = &left[cptr[i] * 400];
+	  ri = &right[cptr[i] * 400];
+	  
+	  vl = &(tipVector[20 * tipX1[i]]);
+	  vr = &x2[20 * i];
+	  v  = &x3[20 * i];	   
+	  
+	  __m256d vv[5];
+	  
+	  vv[0] = _mm256_setzero_pd();
+	  vv[1] = _mm256_setzero_pd();
+	  vv[2] = _mm256_setzero_pd();
+	  vv[3] = _mm256_setzero_pd();
+	  vv[4] = _mm256_setzero_pd();
+	  
+	 
+
+	  for(l = 0; l < 20; l++)
+	    {	       
+	      __m256d 
+		x1v = _mm256_setzero_pd(),
+		x2v = _mm256_setzero_pd();	
+	      
+	      double 
+		*ev = &extEV[l * 20],
+		*lv = &le[l * 20],
+		*rv = &ri[l * 20];														
+#ifdef _FMA
+	      for(k = 0; k < 20; k += 4) 
+		{
+		  __m256d vlv = _mm256_load_pd(&vl[k]);
+		  __m256d lvv = _mm256_load_pd(&lv[k]);
+		  x1v = FMAMACC(x1v,vlv,lvv);
+		  __m256d vrv = _mm256_load_pd(&vr[k]);
+		  __m256d rvv = _mm256_load_pd(&rv[k]);
+		  x2v = FMAMACC(x2v,vrv,rvv);
+		}
+#else	      
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+	      
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+#endif
+
+	      x1v = hadd4(x1v, x2v);			
+	      
+	      __m256d 
+		evv[5];
+	      
+	      evv[0] = _mm256_load_pd(&ev[0]);
+	      evv[1] = _mm256_load_pd(&ev[4]);
+	      evv[2] = _mm256_load_pd(&ev[8]);
+	      evv[3] = _mm256_load_pd(&ev[12]);
+	      evv[4] = _mm256_load_pd(&ev[16]);		
+
+#ifdef _FMA
+	      for(k = 0; k < 5; k++)
+		vv[k] = FMAMACC(vv[k],x1v,evv[k]);		 
+#else	      
+	      vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+	      vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+	      vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+	      vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+	      vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+#endif
+	    }	  
+
+	   	     
+	  __m256d minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD );
+	  
+	  scale = 1;
+	  
+	  for(l = 0; scale && (l < 20); l += 4)
+	    {	       
+	      __m256d 
+		v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
+	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      
+	      if(_mm256_movemask_pd( v1 ) != 15)
+		scale = 0;
+	    }	    	  	  
+	 
+
+	  if(scale)
+	    {
+	      __m256d 
+		twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+	      
+	      for(l = 0; l < 20; l += 4)
+		vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+	  
+	      if(useFastScaling)
+		addScale += wgt[i];
+	      else
+		ex3[i]  += 1;	      
+	    }
+
+	  _mm256_store_pd(&v[0], vv[0]);
+	  _mm256_store_pd(&v[4], vv[1]);
+	  _mm256_store_pd(&v[8], vv[2]);
+	  _mm256_store_pd(&v[12], vv[3]);
+	  _mm256_store_pd(&v[16], vv[4]);	       
+	}
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+	{
+	  le = &left[cptr[i] * 400];
+	  ri = &right[cptr[i] * 400];
+
+	  vl = &x1[20 * i];
+	  vr = &x2[20 * i];
+	  v = &x3[20 * i];
+
+	  __m256d vv[5];
+	  
+	  vv[0] = _mm256_setzero_pd();
+	  vv[1] = _mm256_setzero_pd();
+	  vv[2] = _mm256_setzero_pd();
+	  vv[3] = _mm256_setzero_pd();
+	  vv[4] = _mm256_setzero_pd();
+	  
+	  for(l = 0; l < 20; l++)
+	    {	       
+	      __m256d 
+		x1v = _mm256_setzero_pd(),
+		x2v = _mm256_setzero_pd();	
+	      
+	      double 
+		*ev = &extEV[l * 20],
+		*lv = &le[l * 20],
+		*rv = &ri[l * 20];														
+	      
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+	      
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+
+	      x1v = hadd4(x1v, x2v);			
+#ifdef _FMA
+	       for(k = 0; k < 5; k++) 
+		 {
+		   __m256d evv = _mm256_load_pd(&ev[k*4]);
+		   vv[k] = FMAMACC(vv[k],x1v,evv);
+		 }
+#else	      
+	      __m256d 
+		evv[5];
+	      
+	      evv[0] = _mm256_load_pd(&ev[0]);
+	      evv[1] = _mm256_load_pd(&ev[4]);
+	      evv[2] = _mm256_load_pd(&ev[8]);
+	      evv[3] = _mm256_load_pd(&ev[12]);
+	      evv[4] = _mm256_load_pd(&ev[16]);		
+	      
+	      vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+	      vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+	      vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+	      vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+	      vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+#endif
+	    }	  
+
+	   	     
+	  __m256d minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD );
+	  
+	  scale = 1;
+	  
+	  for(l = 0; scale && (l < 20); l += 4)
+	    {	       
+	      __m256d 
+		v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
+	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      
+	      if(_mm256_movemask_pd( v1 ) != 15)
+		scale = 0;
+	    }	    	  	  
+
+	  if(scale)
+	    {
+	      __m256d 
+		twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+	      
+	      for(l = 0; l < 20; l += 4)
+		vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+	  
+	      if(useFastScaling)
+		addScale += wgt[i];
+	      else
+		ex3[i]  += 1;	      
+	    }
+
+	  _mm256_store_pd(&v[0], vv[0]);
+	  _mm256_store_pd(&v[4], vv[1]);
+	  _mm256_store_pd(&v[8], vv[2]);
+	  _mm256_store_pd(&v[12], vv[3]);
+	  _mm256_store_pd(&v[16], vv[4]);
+	 
+	}
+      break;
+    default:
+      assert(0);
+    }
+  
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
+
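+/* Gap-saving variant of newviewGTRCATPROT_AVX, analogous to the DNA CAT
+   version: all-gap sites share the pre-computed x*_gapColumn entries
+   (computed in the block below) instead of individual 20-double vectors. */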
+void newviewGTRCATPROT_AVX_GAPPED_SAVE(int tipCase, double *extEV,
+				       int *cptr,
+				       double *x1, double *x2, double *x3, double *tipVector,
+				       int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+				       int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+				       unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+				       double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
+{
+  double
+    *le, 
+    *ri, 
+    *v, 
+    *vl, 
+    *vr,
+    *x1_ptr = x1,
+    *x2_ptr = x2, 
+    *x3_ptr = x3;
+  
+  int 
+    i, 
+    l, 
+    scale, 
+    addScale = 0,
+    scaleGap = 0;
+
+#ifdef _FMA
+  int k;
+#endif
+
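+  /* Shared all-gap column, computed with the P matrices at rate-category
+     index maxCats; scaleGap is set if it had to be rescaled (inner cases
+     only). */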
+  {
+    le = &left[maxCats * 400];
+    ri = &right[maxCats * 400];
+    
+    vl = x1_gapColumn;
+    vr = x2_gapColumn;
+    v  = x3_gapColumn;
+
+    __m256d vv[5];
+    
+    vv[0] = _mm256_setzero_pd();
+    vv[1] = _mm256_setzero_pd();
+    vv[2] = _mm256_setzero_pd();
+    vv[3] = _mm256_setzero_pd();
+    vv[4] = _mm256_setzero_pd();
+    
+    for(l = 0; l < 20; l++)
+      {	       
+	__m256d 
+	  x1v = _mm256_setzero_pd(),
+	  x2v = _mm256_setzero_pd();	
+	
+	double 
+	  *ev = &extEV[l * 20],
+	  *lv = &le[l * 20],
+	  *rv = &ri[l * 20];														
+	
+	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+	
+	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+	
+	x1v = hadd4(x1v, x2v);			
+#ifdef _FMA
+	for(k = 0; k < 5; k++) 
+	  {
+	    __m256d evv = _mm256_load_pd(&ev[k*4]);
+	    vv[k] = FMAMACC(vv[k],x1v,evv);
+	  }
+#else	      
+	__m256d 
+	  evv[5];
+	
+	evv[0] = _mm256_load_pd(&ev[0]);
+	evv[1] = _mm256_load_pd(&ev[4]);
+	evv[2] = _mm256_load_pd(&ev[8]);
+	evv[3] = _mm256_load_pd(&ev[12]);
+	evv[4] = _mm256_load_pd(&ev[16]);		
+	
+	vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+	vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+	vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+	vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+	vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+#endif
+      }	  
+
+
+     if(tipCase != PLL_TIP_TIP)
+       {
+	 __m256d minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD );
+	  
+	 scale = 1;
+	  
+	 for(l = 0; scale && (l < 20); l += 4)
+	   {	       
+	     __m256d 
+	       v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
+	     v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	     
+	     if(_mm256_movemask_pd( v1 ) != 15)
+	       scale = 0;
+	   }	    	  	  
+
+	 if(scale)
+	   {
+	      __m256d 
+		twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+	      
+	      for(l = 0; l < 20; l += 4)
+		vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 	      	     	      
+	   
+	      scaleGap = 1;
+	   }
+       }
+
+     _mm256_store_pd(&v[0], vv[0]);
+     _mm256_store_pd(&v[4], vv[1]);
+     _mm256_store_pd(&v[8], vv[2]);
+     _mm256_store_pd(&v[12], vv[3]);
+     _mm256_store_pd(&v[16], vv[4]);     
+  }
+
+
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	for (i = 0; i < n; i++)
+	  {
+	    if(noGap(x3_gap, i))	   
+	      {	    
+		vl = &(tipVector[20 * tipX1[i]]);
+		vr = &(tipVector[20 * tipX2[i]]);
+		v  = x3_ptr;	    	    	   	    
+
+		if(isGap(x1_gap, i))
+		  le =  &left[maxCats * 400];
+		else	  	  
+		  le =  &left[cptr[i] * 400];	  
+		
+		if(isGap(x2_gap, i))
+		  ri =  &right[maxCats * 400];
+		else	 	  
+		  ri =  &right[cptr[i] * 400];
+
+		__m256d vv[5];
+		
+		vv[0] = _mm256_setzero_pd();
+		vv[1] = _mm256_setzero_pd();
+		vv[2] = _mm256_setzero_pd();
+		vv[3] = _mm256_setzero_pd();
+		vv[4] = _mm256_setzero_pd();	   	    
+		
+		for(l = 0; l < 20; l++)
+		  {	       
+		    __m256d 
+		      x1v = _mm256_setzero_pd(),
+		      x2v = _mm256_setzero_pd();	
+		    
+		    double 
+		      *ev = &extEV[l * 20],
+		      *lv = &le[l * 20],
+		      *rv = &ri[l * 20];														
+		    
+#ifdef _FMA		
+		    for(k = 0; k < 20; k += 4) 
+		      {
+			__m256d vlv = _mm256_load_pd(&vl[k]);
+			__m256d lvv = _mm256_load_pd(&lv[k]);
+			x1v = FMAMACC(x1v,vlv,lvv);
+			__m256d vrv = _mm256_load_pd(&vr[k]);
+			__m256d rvv = _mm256_load_pd(&rv[k]);
+			x2v = FMAMACC(x2v,vrv,rvv);
+		      }
+#else		
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+		    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));	
+#endif
+		    
+		    x1v = hadd4(x1v, x2v);			
+#ifdef _FMA
+		    for(k = 0; k < 5; k++) 
+		      {
+			__m256d evv = _mm256_load_pd(&ev[k*4]);
+			vv[k] = FMAMACC(vv[k],x1v,evv);
+		      }	  
+#else		
+		    __m256d 
+		      evv[5];
+		    
+		    evv[0] = _mm256_load_pd(&ev[0]);
+		    evv[1] = _mm256_load_pd(&ev[4]);
+		    evv[2] = _mm256_load_pd(&ev[8]);
+		    evv[3] = _mm256_load_pd(&ev[12]);
+		    evv[4] = _mm256_load_pd(&ev[16]);		
+		    
+		    vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+		    vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+		    vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+		    vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+		    vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      		      	  
+#endif
+		  }
+		
+		_mm256_store_pd(&v[0], vv[0]);
+		_mm256_store_pd(&v[4], vv[1]);
+		_mm256_store_pd(&v[8], vv[2]);
+		_mm256_store_pd(&v[12], vv[3]);
+		_mm256_store_pd(&v[16], vv[4]);
+
+		x3_ptr += 20;
+	      }
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:      	
+      for (i = 0; i < n; i++)
+	{
+	  if(isGap(x3_gap, i))
+	    {
+	      if(scaleGap)
+		{
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i] += 1;		   		    
+		}	     
+	    }
+	  else
+	    {
+	      vl = &(tipVector[20 * tipX1[i]]);
+
+	      vr = x2_ptr;
+	      v = x3_ptr;
+	      
+	      if(isGap(x1_gap, i))
+		le =  &left[maxCats * 400];
+	      else
+		le =  &left[cptr[i] * 400];
+	      
+	      if(isGap(x2_gap, i))
+		{		 
+		  ri =  &right[maxCats * 400];
+		  vr = x2_gapColumn;
+		}
+	      else
+		{
+		  ri =  &right[cptr[i] * 400];
+		  vr = x2_ptr;
+		  x2_ptr += 20;
+		}	  	  
+	  
+	      __m256d vv[5];
+	      
+	      vv[0] = _mm256_setzero_pd();
+	      vv[1] = _mm256_setzero_pd();
+	      vv[2] = _mm256_setzero_pd();
+	      vv[3] = _mm256_setzero_pd();
+	      vv[4] = _mm256_setzero_pd();
+	      	      	      
+	      for(l = 0; l < 20; l++)
+		{	       
+		  __m256d 
+		    x1v = _mm256_setzero_pd(),
+		    x2v = _mm256_setzero_pd();	
+		  
+		  double 
+		    *ev = &extEV[l * 20],
+		    *lv = &le[l * 20],
+		    *rv = &ri[l * 20];														
+#ifdef _FMA
+		  for(k = 0; k < 20; k += 4) 
+		    {
+		      __m256d vlv = _mm256_load_pd(&vl[k]);
+		      __m256d lvv = _mm256_load_pd(&lv[k]);
+		      x1v = FMAMACC(x1v,vlv,lvv);
+		      __m256d vrv = _mm256_load_pd(&vr[k]);
+		      __m256d rvv = _mm256_load_pd(&rv[k]);
+		      x2v = FMAMACC(x2v,vrv,rvv);
+		    }
+#else	      
+		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+		  
+		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+#endif
+		  
+		  x1v = hadd4(x1v, x2v);			
+		  
+		  __m256d 
+		    evv[5];
+		  
+		  evv[0] = _mm256_load_pd(&ev[0]);
+		  evv[1] = _mm256_load_pd(&ev[4]);
+		  evv[2] = _mm256_load_pd(&ev[8]);
+		  evv[3] = _mm256_load_pd(&ev[12]);
+		  evv[4] = _mm256_load_pd(&ev[16]);		
+		  
+#ifdef _FMA
+		  for(k = 0; k < 5; k++)
+		    vv[k] = FMAMACC(vv[k],x1v,evv[k]);		 
+#else	      
+		  vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+		  vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+		  vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+		  vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+		  vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+#endif
+		}	  
+
+	   	     
+	      __m256d minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD );
+	  
+	      scale = 1;
+	      
+	      for(l = 0; scale && (l < 20); l += 4)
+		{	       
+		  __m256d 
+		    v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
+		  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		  
+		  if(_mm256_movemask_pd( v1 ) != 15)
+		    scale = 0;
+		}	    	  	  
+	 
+	      if(scale)
+		{
+		  __m256d 
+		    twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+		  
+		  for(l = 0; l < 20; l += 4)
+		    vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+		  
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i]  += 1;	      
+		}
+
+	      _mm256_store_pd(&v[0], vv[0]);
+	      _mm256_store_pd(&v[4], vv[1]);
+	      _mm256_store_pd(&v[8], vv[2]);
+	      _mm256_store_pd(&v[12], vv[3]);
+	      _mm256_store_pd(&v[16], vv[4]);	       
+	      
+	      x3_ptr += 20;
+	    }
+	}    
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+	{
+	   if(isGap(x3_gap, i))
+	     {
+	       if(scaleGap)		   		    
+		 {
+		   if(useFastScaling)
+		     addScale += wgt[i];
+		   else
+		     ex3[i] += 1;
+		 }		 	       
+	     }
+	   else
+	     {
+
+	        v = x3_ptr;
+
+		if(isGap(x1_gap, i))
+		  {
+		    vl = x1_gapColumn;
+		    le =  &left[maxCats * 400];
+		  }
+		else
+		  {
+		    le =  &left[cptr[i] * 400];
+		    vl = x1_ptr;
+		    x1_ptr += 20;
+		  }
+		
+		if(isGap(x2_gap, i))	
+		  {
+		    vr = x2_gapColumn;
+		    ri =  &right[maxCats * 400];	    
+		  }
+		else
+		  {
+		    ri =  &right[cptr[i] * 400];
+		    vr = x2_ptr;
+		    x2_ptr += 20;
+		  }	 	  	 
+		
+		__m256d vv[5];
+		
+		vv[0] = _mm256_setzero_pd();
+		vv[1] = _mm256_setzero_pd();
+		vv[2] = _mm256_setzero_pd();
+		vv[3] = _mm256_setzero_pd();
+		vv[4] = _mm256_setzero_pd();
+		
+		for(l = 0; l < 20; l++)
+		  {	       
+		    __m256d 
+		      x1v = _mm256_setzero_pd(),
+		      x2v = _mm256_setzero_pd();	
+		    
+		    double 
+		      *ev = &extEV[l * 20],
+		      *lv = &le[l * 20],
+		      *rv = &ri[l * 20];														
+		    
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+		    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+		    
+		    x1v = hadd4(x1v, x2v);			
+#ifdef _FMA
+		    for(k = 0; k < 5; k++) 
+		      {
+			__m256d evv = _mm256_load_pd(&ev[k*4]);
+			vv[k] = FMAMACC(vv[k],x1v,evv);
+		      }
+#else	      
+		    __m256d 
+		      evv[5];
+		    
+		    evv[0] = _mm256_load_pd(&ev[0]);
+		    evv[1] = _mm256_load_pd(&ev[4]);
+		    evv[2] = _mm256_load_pd(&ev[8]);
+		    evv[3] = _mm256_load_pd(&ev[12]);
+		    evv[4] = _mm256_load_pd(&ev[16]);		
+		    
+		    vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+		    vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+		    vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+		    vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+		    vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+#endif
+		  }	  
+
+	   	     
+		__m256d minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD );
+		
+		scale = 1;
+		
+		for(l = 0; scale && (l < 20); l += 4)
+		  {	       
+		    __m256d 
+		      v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
+		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    
+		    if(_mm256_movemask_pd( v1 ) != 15)
+		      scale = 0;
+		  }	    	  	  
+		
+		if(scale)
+		  {
+		    __m256d 
+		      twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+		    
+		    for(l = 0; l < 20; l += 4)
+		      vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+		    
+		    if(useFastScaling)
+		      addScale += wgt[i];
+		    else
+		      ex3[i]  += 1;	      
+		  }
+
+		_mm256_store_pd(&v[0], vv[0]);
+		_mm256_store_pd(&v[4], vv[1]);
+		_mm256_store_pd(&v[8], vv[2]);
+		_mm256_store_pd(&v[12], vv[3]);
+		_mm256_store_pd(&v[16], vv[4]);
+
+		 x3_ptr += 20;
+	     }
+	}   
+      break;
+    default:
+      assert(0);
+    }
+  
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
+
+
+
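+/* AVX kernel for protein data under GAMMA with LG4-type models: every rate
+   category k has its own eigenvector matrix extEV[k] and tip vector
+   tipVector[k], hence the per-category indexing below.  A site occupies 80
+   doubles (4 categories x 20 states). */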
+void newviewGTRGAMMAPROT_AVX_LG4(int tipCase,
+				 double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
+				 int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, 
+				 double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling) 
+{
+  double	
+    *uX1, 
+    *uX2, 
+    *v, 
+    x1px2, 
+    *vl, 
+    *vr;
+  
+  int	
+    i, 
+    j, 
+    l, 
+    k, 
+    scale, 
+    addScale = 0;
+
+ 
+#ifndef GCC_VERSION
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif
+
+
+#if GCC_VERSION < 40500 && defined (__GNUC__)
+   __m256d
+    bitmask = _mm256_set_pd(0,0,0,-1);
+#else
+  __m256i
+    bitmask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+#endif 
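+  /* bitmask selects only the lowest double of an AVX register; together with
+     _mm256_maskstore_pd it stores just the horizontal sum produced by hadd3()
+     into the umpX1/ump_x2 look-up tables (older GCC intrinsics apparently
+     expect the mask as __m256d, newer ones as __m256i). */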
+  
+  switch(tipCase) 
+    {
+    case PLL_TIP_TIP: 
+      {
+
+	PLL_ALIGN_BEGIN double
+	  umpX1[1840] PLL_ALIGN_END,
+	  umpX2[1840] PLL_ALIGN_END;
+
+	
+	for(i = 0; i < 23; i++) 
+	  {	    	    
+	    for(k = 0; k < 80; k++) 
+	      {
+		double 
+		  *ll =  &left[k * 20],
+		  *rr =  &right[k * 20];
+		
+		__m256d 
+		  umpX1v = _mm256_setzero_pd(),
+		  umpX2v = _mm256_setzero_pd();
+		
+		v = &(tipVector[k / 20][20 * i]);
+
+		for(l = 0; l < 20; l+=4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+#ifdef _FMA
+		    __m256d llv = _mm256_load_pd(&ll[l]);
+		    umpX1v = FMAMACC(umpX1v,vv,llv);
+		    __m256d rrv = _mm256_load_pd(&rr[l]);
+		    umpX2v = FMAMACC(umpX2v,vv,rrv);
+#else		    
+		    umpX1v = _mm256_add_pd(umpX1v,_mm256_mul_pd(vv,_mm256_load_pd(&ll[l])));
+		    umpX2v = _mm256_add_pd(umpX2v,_mm256_mul_pd(vv,_mm256_load_pd(&rr[l])));
+#endif
+		  }
+		
+		umpX1v = hadd3(umpX1v);
+		umpX2v = hadd3(umpX2v);
+		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
+	      } 
+	  }
+
+	for(i = 0; i < n; i++) 
+	  {	    
+	    uX1 = &umpX1[80 * tipX1[i]];
+	    uX2 = &umpX2[80 * tipX2[i]];
+	   
+	    for(j = 0; j < 4; j++) 
+	      {     	
+		__m256d vv[5];  
+
+		v = &x3[i * 80 + j * 20];
+			
+		vv[0] = _mm256_setzero_pd();
+		vv[1] = _mm256_setzero_pd();
+		vv[2] = _mm256_setzero_pd();
+		vv[3] = _mm256_setzero_pd();
+		vv[4] = _mm256_setzero_pd();
+
+		for(k = 0; k < 20; k++) 
+		  {			 
+		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+
+		    __m256d x1px2v = _mm256_set1_pd(x1px2);		    
+		    
+		    __m256d extEvv = _mm256_load_pd(&extEV[j][20 * k]);
+#ifdef _FMA
+		    vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
+#else
+		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[0],vv[0]);
+		    
+		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 4]);
+#ifdef _FMA
+		    vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
+#else
+		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[4],vv[1]);
+
+		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 8]);
+#ifdef _FMA
+		    vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
+#else
+		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[8],vv[2]);
+
+		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 12]);
+#ifdef _FMA
+		    vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
+#else
+		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[12],vv[3]);
+
+		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 16]);
+#ifdef _FMA
+		    vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
+#else
+		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[16],vv[4]);
+		  } 
+	      } 
+	  } 
+      } 
+      break;
+    case PLL_TIP_INNER: 
+      {
+
+	PLL_ALIGN_BEGIN double
+	  umpX1[1840] PLL_ALIGN_END,
+	  ump_x2[20] PLL_ALIGN_END;
+
+	for(i = 0; i < 23; i++) 
+	  {	   
+	    for(k = 0; k < 80; k++) 
+	      {
+		__m256d umpX1v = _mm256_setzero_pd();
+		
+		 v = &(tipVector[k / 20][20 * i]);
+
+		for(l = 0; l < 20; l+=4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+		    __m256d leftv = _mm256_load_pd(&left[k * 20 + l]);
+#ifdef _FMA
+		   
+		    umpX1v = FMAMACC(umpX1v, vv, leftv);
+#else
+		    umpX1v = _mm256_add_pd(umpX1v, _mm256_mul_pd(vv, leftv));
+#endif
+		  }
+		umpX1v = hadd3(umpX1v);
+		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+	      } 
+	  }
+	
+	for (i = 0; i < n; i++) 
+	  {	   
+	    uX1 = &umpX1[80 * tipX1[i]];
+	   	    
+	    for(k = 0; k < 4; k++) 
+	      {
+		v = &(x2[80 * i + k * 20]);
+		
+		for(l = 0; l < 20; l++) 
+		  {
+		    __m256d ump_x2v = _mm256_setzero_pd();
+		    		  
+		    __m256d vv = _mm256_load_pd(&v[0]);
+		    __m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+		    
+		    vv = _mm256_load_pd(&v[4]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+4]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[8]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+8]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[12]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+12]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[16]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+16]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+		    
+		    ump_x2v = hadd3(ump_x2v);
+		    _mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		  }
+		
+		v = &(x3[80 * i + 20 * k]);
+	
+
+		__m256d vv[5]; 
+
+		vv[0] = _mm256_setzero_pd();
+		vv[1] = _mm256_setzero_pd();
+		vv[2] = _mm256_setzero_pd();
+		vv[3] = _mm256_setzero_pd();
+		vv[4] = _mm256_setzero_pd();
+		
+		for(l = 0; l < 20; l++) 
+		  {
+		    x1px2 = uX1[k * 20 + l]	* ump_x2[l];
+		    __m256d x1px2v = _mm256_set1_pd(x1px2);	
+	    		 
+#ifdef _FMA
+		    __m256d ev = _mm256_load_pd(&extEV[k][l * 20 + 0]);
+		    vv[0] = FMAMACC(vv[0],x1px2v, ev);
+#else
+		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 0])));
+#endif
+		    _mm256_store_pd(&v[0],vv[0]);
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[k][l * 20 + 4]);
+		    vv[1] = FMAMACC(vv[1],x1px2v, ev);
+#else
+		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 4])));
+#endif
+		    _mm256_store_pd(&v[4],vv[1]);
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[k][l * 20 + 8]);
+		    vv[2] = FMAMACC(vv[2],x1px2v, ev);
+#else
+		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 8])));
+#endif
+		    _mm256_store_pd(&v[8],vv[2]);
+		    
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[k][l * 20 + 12]);
+		    vv[3] = FMAMACC(vv[3],x1px2v, ev);
+#else
+		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 12])));
+#endif
+		    _mm256_store_pd(&v[12],vv[3]);
+
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[k][l * 20 + 16]);
+		    vv[4] = FMAMACC(vv[4],x1px2v, ev);
+#else
+		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 16])));
+#endif
+		    _mm256_store_pd(&v[16],vv[4]);
+
+		  } 
+	      }
+	   
+	    v = &x3[80 * i];
+	    __m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);
+	    scale = 1;
+	    for(l = 0; scale && (l < 80); l += 4) 
+	      {
+		__m256d vv = _mm256_load_pd(&v[l]);
+		__m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+		vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+		if(_mm256_movemask_pd(vv_abs) != 15)
+		  scale = 0;
+	      }
+	    
+	    if(scale) 
+	      {		
+		__m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+		for(l = 0; l < 80; l += 4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+		    _mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+		  }
+		if(useFastScaling)
+		  addScale += wgt[i];				
+		else
+		  ex3[i] += 1;
+	      } 
+	  } 
+      } 
+      break;
+    case PLL_INNER_INNER:      
+      for(i = 0; i < n; i++) 
+	{ 
+	  scale = 1;
+	  
+	  for(k = 0; k < 4; k++) 
+	    {
+	      vl = &(x1[80 * i + 20 * k]);
+	      vr = &(x2[80 * i + 20 * k]);
+	      v  = &(x3[80 * i + 20 * k]);	      	   
+
+	      __m256d vv[5]; 
+	      
+	      vv[0] = _mm256_setzero_pd();
+	      vv[1] = _mm256_setzero_pd();
+	      vv[2] = _mm256_setzero_pd();
+	      vv[3] = _mm256_setzero_pd();
+	      vv[4] = _mm256_setzero_pd();
+	      
+	      for(l = 0; l < 20; l++) 
+		{		  
+		  __m256d al = _mm256_setzero_pd();
+		  __m256d ar = _mm256_setzero_pd();
+       		  
+		  __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+		  __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+		  __m256d vlv = _mm256_load_pd(&vl[0]);
+		  __m256d vrv = _mm256_load_pd(&vr[0]);
+		  
+#ifdef _FMA
+		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+#endif
+
+		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+		  vlv = _mm256_load_pd(&vl[4]);
+		  vrv = _mm256_load_pd(&vr[4]);
+#ifdef _FMA
+		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+
+		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+		  vlv = _mm256_load_pd(&vl[8]);
+		  vrv = _mm256_load_pd(&vr[8]);
+#ifdef _FMA
+		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+
+		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+		  vlv = _mm256_load_pd(&vl[12]);
+		  vrv = _mm256_load_pd(&vr[12]);
+#ifdef _FMA
+		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+
+		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+		  vlv = _mm256_load_pd(&vl[16]);
+		  vrv = _mm256_load_pd(&vr[16]);
+
+#ifdef _FMA		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+
+		  /**************************************************************************************************************/
+
+		  al = hadd3(al);
+		  ar = hadd3(ar);
+		  al = _mm256_mul_pd(ar,al);
+		  
+		  /************************************************************************************************************/
+#ifdef _FMA		    
+		  __m256d ev =  _mm256_load_pd(&extEV[k][20 * l + 0]);
+		  vv[0] = FMAMACC(vv[0], al, ev);		 
+#else
+		  vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 0])));			  		 		  
+#endif
+		  _mm256_store_pd(&v[0],vv[0]);
+
+#ifdef _FMA		    
+		  ev =  _mm256_load_pd(&extEV[k][20 * l + 4]);
+		  vv[1] = FMAMACC(vv[1], al, ev);		 
+#else
+		  vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 4])));		  		 
+#endif
+		  _mm256_store_pd(&v[4],vv[1]);
+
+#ifdef _FMA		    
+		  ev =  _mm256_load_pd(&extEV[k][20 * l + 8]);
+		  vv[2] = FMAMACC(vv[2], al, ev);		 
+#else
+		  vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 8])));		  		 
+#endif
+		  _mm256_store_pd(&v[8],vv[2]);
+
+#ifdef _FMA		    
+		  ev =  _mm256_load_pd(&extEV[k][20 * l + 12]);
+		  vv[3] = FMAMACC(vv[3], al, ev);		 
+#else
+		  vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 12])));		  		 
+#endif
+		  _mm256_store_pd(&v[12],vv[3]);
+
+#ifdef _FMA		    
+		  ev =  _mm256_load_pd(&extEV[k][20 * l + 16]);
+		  vv[4] = FMAMACC(vv[4], al, ev);		 
+#else
+		  vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 16])));			 	  
+#endif
+		  _mm256_store_pd(&v[16],vv[4]);		 
+		} 
+	    }
+	  v = &(x3[80 * i]);
+	  scale = 1;
+	  __m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);	 
+
+	  for(l = 0; scale && (l < 80); l += 4) 
+	    {
+	      __m256d vv = _mm256_load_pd(&v[l]);
+	      __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+	      vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+	      if(_mm256_movemask_pd(vv_abs) != 15)
+		scale = 0;	     
+	    }
+
+	  if(scale) 
+	    {		     	      
+	      __m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+	      for(l = 0; l < 80; l += 4) 
+		{
+		  __m256d vv = _mm256_load_pd(&v[l]);
+		  _mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+		}
+	      if(useFastScaling)
+		addScale += wgt[i];					
+	      else
+		ex3[i] += 1;
+	    } 
+	}
+      break;
+    default:
+      assert(0);
+    }
+ 
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
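A note on the numerical-stability step that closes every tipCase branch of these AVX kernels, including the one ending above: when all 80 entries of a site's conditional likelihood vector drop below PLL_MINLIKELIHOOD in absolute value, the whole vector is multiplied by PLL_TWOTOTHE256, and the rescaling is recorded either as a weighted aggregate (fast scaling, addScale += wgt[i]) or per site in ex3[i]. Below is a minimal scalar sketch of that rule; MINLIK and TWO_TO_256 are stand-ins for the PLL constants, with assumed values.

    #include <math.h>

    #define MINLIK      1.0e-256              /* stand-in for PLL_MINLIKELIHOOD (assumed value)     */
    #define TWO_TO_256  1.157920892373162e77  /* stand-in for PLL_TWOTOTHE256, i.e. 2^256 (assumed) */

    /* Scalar sketch of PLL's per-site rescaling rule for an 80-entry site vector. */
    static void rescale_site(double v[80], int wgt_i, int use_fast,
                             int *add_scale, int *ex3_i)
    {
      int l, scale = 1;

      for (l = 0; scale && l < 80; l++)       /* rescale only if ALL entries are tiny */
        if (fabs(v[l]) >= MINLIK)
          scale = 0;

      if (scale)
        {
          for (l = 0; l < 80; l++)
            v[l] *= TWO_TO_256;               /* shift the values back into a safe range */
          if (use_fast)
            *add_scale += wgt_i;              /* aggregate correction (fast scaling)     */
          else
            *ex3_i += 1;                      /* per-site scaling counter                */
        }
    }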
+ 
+
+void newviewGTRGAMMAPROT_AVX(int tipCase,
+			     double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+			     int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, 
+			     double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling) 
+{
+  double	
+    *uX1, 
+    *uX2, 
+    *v, 
+    x1px2, 
+    *vl, 
+    *vr;
+  
+  int	
+    i, 
+    j, 
+    l, 
+    k, 
+    scale, 
+    addScale = 0;
+
+ 
+#ifndef GCC_VERSION
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif
+
+
+#if GCC_VERSION < 40500 && defined(__GNUC__)
+   __m256d
+    bitmask = _mm256_set_pd(0,0,0,-1);
+#else
+  __m256i
+    bitmask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+#endif 
+  
+  switch(tipCase) 
+    {
+    case PLL_TIP_TIP: 
+      {
+       
+    PLL_ALIGN_BEGIN double
+	  umpX1[1840] PLL_ALIGN_END,
+	  umpX2[1840] PLL_ALIGN_END;
+
+	for(i = 0; i < 23; i++) 
+	  {
+	    v = &(tipVector[20 * i]);
+	    
+	    for(k = 0; k < 80; k++) 
+	      {
+		double 
+		  *ll =  &left[k * 20],
+		  *rr =  &right[k * 20];
+		
+		__m256d 
+		  umpX1v = _mm256_setzero_pd(),
+		  umpX2v = _mm256_setzero_pd();
+		
+		for(l = 0; l < 20; l+=4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+#ifdef _FMA
+		    __m256d llv = _mm256_load_pd(&ll[l]);
+		    umpX1v = FMAMACC(umpX1v,vv,llv);
+		    __m256d rrv = _mm256_load_pd(&rr[l]);
+		    umpX2v = FMAMACC(umpX2v,vv,rrv);
+#else		    
+		    umpX1v = _mm256_add_pd(umpX1v,_mm256_mul_pd(vv,_mm256_load_pd(&ll[l])));
+		    umpX2v = _mm256_add_pd(umpX2v,_mm256_mul_pd(vv,_mm256_load_pd(&rr[l])));
+#endif
+		  }
+		
+		umpX1v = hadd3(umpX1v);
+		umpX2v = hadd3(umpX2v);
+		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
+	      } 
+	  }
+
+	for(i = 0; i < n; i++) 
+	  {	    
+	    uX1 = &umpX1[80 * tipX1[i]];
+	    uX2 = &umpX2[80 * tipX2[i]];
+	   
+	    for(j = 0; j < 4; j++) 
+	      {     	
+		__m256d vv[5];  
+
+		v = &x3[i * 80 + j * 20];
+			
+		vv[0] = _mm256_setzero_pd();
+		vv[1] = _mm256_setzero_pd();
+		vv[2] = _mm256_setzero_pd();
+		vv[3] = _mm256_setzero_pd();
+		vv[4] = _mm256_setzero_pd();
+
+		for(k = 0; k < 20; k++) 
+		  {			 
+		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+
+		    __m256d x1px2v = _mm256_set1_pd(x1px2);		    
+		    
+		    __m256d extEvv = _mm256_load_pd(&extEV[20 * k]);
+#ifdef _FMA
+		    vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
+#else
+		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[0],vv[0]);
+		    
+		    extEvv = _mm256_load_pd(&extEV[20 * k + 4]);
+#ifdef _FMA
+		    vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
+#else
+		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[4],vv[1]);
+
+		    extEvv = _mm256_load_pd(&extEV[20 * k + 8]);
+#ifdef _FMA
+		    vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
+#else
+		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[8],vv[2]);
+
+		    extEvv = _mm256_load_pd(&extEV[20 * k + 12]);
+#ifdef _FMA
+		    vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
+#else
+		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[12],vv[3]);
+
+		    extEvv = _mm256_load_pd(&extEV[20 * k + 16]);
+#ifdef _FMA
+		    vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
+#else
+		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[16],vv[4]);
+		  } 
+	      } 
+	  } 
+      } 
+      break;
+    case PLL_TIP_INNER: 
+      {
+
+    	  PLL_ALIGN_BEGIN double
+	  umpX1[1840] PLL_ALIGN_END,
+	  ump_x2[20] PLL_ALIGN_END;
+
+	for(i = 0; i < 23; i++) 
+	  {
+	    v = &(tipVector[20 * i]);
+
+	    for(k = 0; k < 80; k++) 
+	      {
+		__m256d umpX1v = _mm256_setzero_pd();
+		for(l = 0; l < 20; l+=4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+		    __m256d leftv = _mm256_load_pd(&left[k * 20 + l]);
+#ifdef _FMA
+		   
+		    umpX1v = FMAMACC(umpX1v, vv, leftv);
+#else
+		    umpX1v = _mm256_add_pd(umpX1v, _mm256_mul_pd(vv, leftv));
+#endif
+		  }
+		umpX1v = hadd3(umpX1v);
+		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+	      } 
+	  }
+	
+	for (i = 0; i < n; i++) 
+	  {	   
+	    uX1 = &umpX1[80 * tipX1[i]];
+	   	    
+	    for(k = 0; k < 4; k++) 
+	      {
+		v = &(x2[80 * i + k * 20]);
+		
+		for(l = 0; l < 20; l++) 
+		  {
+		    __m256d ump_x2v = _mm256_setzero_pd();
+		    		  
+		    __m256d vv = _mm256_load_pd(&v[0]);
+		    __m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+		    
+		    vv = _mm256_load_pd(&v[4]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+4]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[8]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+8]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[12]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+12]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[16]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+16]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+		    
+		    ump_x2v = hadd3(ump_x2v);
+		    _mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		  }
+		
+		v = &(x3[80 * i + 20 * k]);
+	
+
+		__m256d vv[5]; 
+
+		vv[0] = _mm256_setzero_pd();
+		vv[1] = _mm256_setzero_pd();
+		vv[2] = _mm256_setzero_pd();
+		vv[3] = _mm256_setzero_pd();
+		vv[4] = _mm256_setzero_pd();
+		
+		for(l = 0; l < 20; l++) 
+		  {
+		    x1px2 = uX1[k * 20 + l]	* ump_x2[l];
+		    __m256d x1px2v = _mm256_set1_pd(x1px2);	
+	    		 
+#ifdef _FMA
+		    __m256d ev = _mm256_load_pd(&extEV[l * 20 + 0]);
+		    vv[0] = FMAMACC(vv[0],x1px2v, ev);
+#else
+		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 0])));
+#endif
+		    _mm256_store_pd(&v[0],vv[0]);
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[l * 20 + 4]);
+		    vv[1] = FMAMACC(vv[1],x1px2v, ev);
+#else
+		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 4])));
+#endif
+		    _mm256_store_pd(&v[4],vv[1]);
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[l * 20 + 8]);
+		    vv[2] = FMAMACC(vv[2],x1px2v, ev);
+#else
+		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 8])));
+#endif
+		    _mm256_store_pd(&v[8],vv[2]);
+		    
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[l * 20 + 12]);
+		    vv[3] = FMAMACC(vv[3],x1px2v, ev);
+#else
+		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 12])));
+#endif
+		    _mm256_store_pd(&v[12],vv[3]);
+
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[l * 20 + 16]);
+		    vv[4] = FMAMACC(vv[4],x1px2v, ev);
+#else
+		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 16])));
+#endif
+		    _mm256_store_pd(&v[16],vv[4]);
+
+		  } 
+	      }
+	   
+	    v = &x3[80 * i];
+	    __m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);
+	    scale = 1;
+	    for(l = 0; scale && (l < 80); l += 4) 
+	      {
+		__m256d vv = _mm256_load_pd(&v[l]);
+		__m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+		vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+		if(_mm256_movemask_pd(vv_abs) != 15)
+		  scale = 0;
+	      }
+	    
+	    if(scale) 
+	      {		
+		__m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+		for(l = 0; l < 80; l += 4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+		    _mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+		  }
+		if(useFastScaling)
+		  addScale += wgt[i];				
+		else
+		  ex3[i] += 1;
+	      } 
+	  } 
+      } 
+      break;
+    case PLL_INNER_INNER:      
+      for(i = 0; i < n; i++) 
+	{ 
+	  scale = 1;
+	  
+	  for(k = 0; k < 4; k++) 
+	    {
+	      vl = &(x1[80 * i + 20 * k]);
+	      vr = &(x2[80 * i + 20 * k]);
+	      v  = &(x3[80 * i + 20 * k]);	      	   
+
+	      __m256d vv[5]; 
+	      
+	      vv[0] = _mm256_setzero_pd();
+	      vv[1] = _mm256_setzero_pd();
+	      vv[2] = _mm256_setzero_pd();
+	      vv[3] = _mm256_setzero_pd();
+	      vv[4] = _mm256_setzero_pd();
+	      
+	      for(l = 0; l < 20; l++) 
+		{		  
+		  __m256d al = _mm256_setzero_pd();
+		  __m256d ar = _mm256_setzero_pd();
+       		  
+		  __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+		  __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+		  __m256d vlv = _mm256_load_pd(&vl[0]);
+		  __m256d vrv = _mm256_load_pd(&vr[0]);
+		  
+#ifdef _FMA
+		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+#endif
+
+		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+		  vlv = _mm256_load_pd(&vl[4]);
+		  vrv = _mm256_load_pd(&vr[4]);
+#ifdef _FMA
+		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+
+		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+		  vlv = _mm256_load_pd(&vl[8]);
+		  vrv = _mm256_load_pd(&vr[8]);
+#ifdef _FMA
+		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+
+		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+		  vlv = _mm256_load_pd(&vl[12]);
+		  vrv = _mm256_load_pd(&vr[12]);
+#ifdef _FMA
+		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+
+		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+		  vlv = _mm256_load_pd(&vl[16]);
+		  vrv = _mm256_load_pd(&vr[16]);
+
+#ifdef _FMA		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+
+		  /**************************************************************************************************************/
+
+		  al = hadd3(al);
+		  ar = hadd3(ar);
+		  al = _mm256_mul_pd(ar,al);
+		  
+		  /************************************************************************************************************/
+#ifdef _FMA		    
+		  __m256d ev =  _mm256_load_pd(&extEV[20 * l + 0]);
+		  vv[0] = FMAMACC(vv[0], al, ev);		 
+#else
+		  vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
+#endif
+		  _mm256_store_pd(&v[0],vv[0]);
+
+#ifdef _FMA		    
+		  ev =  _mm256_load_pd(&extEV[20 * l + 4]);
+		  vv[1] = FMAMACC(vv[1], al, ev);		 
+#else
+		  vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 4])));		  		 
+#endif
+		  _mm256_store_pd(&v[4],vv[1]);
+
+#ifdef _FMA		    
+		  ev =  _mm256_load_pd(&extEV[20 * l + 8]);
+		  vv[2] = FMAMACC(vv[2], al, ev);		 
+#else
+		  vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 8])));		  		 
+#endif
+		  _mm256_store_pd(&v[8],vv[2]);
+
+#ifdef _FMA		    
+		  ev =  _mm256_load_pd(&extEV[20 * l + 12]);
+		  vv[3] = FMAMACC(vv[3], al, ev);		 
+#else
+		  vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 12])));		  		 
+#endif
+		  _mm256_store_pd(&v[12],vv[3]);
+
+#ifdef _FMA		    
+		  ev =  _mm256_load_pd(&extEV[20 * l + 16]);
+		  vv[4] = FMAMACC(vv[4], al, ev);		 
+#else
+		  vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 16])));			 	  
+#endif
+		  _mm256_store_pd(&v[16],vv[4]);		 
+		} 
+	    }
+	  v = &(x3[80 * i]);
+	  scale = 1;
+	  __m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);	 
+
+	  for(l = 0; scale && (l < 80); l += 4) 
+	    {
+	      __m256d vv = _mm256_load_pd(&v[l]);
+	      __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+	      vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+	      if(_mm256_movemask_pd(vv_abs) != 15)
+		scale = 0;	     
+	    }
+
+	  if(scale) 
+	    {		     	      
+	      __m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+	      for(l = 0; l < 80; l += 4) 
+		{
+		  __m256d vv = _mm256_load_pd(&v[l]);
+		  _mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+		}
+	      if(useFastScaling)
+		addScale += wgt[i];					
+	      else
+		ex3[i] += 1;
+	    } 
+	}
+      break;
+    default:
+      assert(0);
+    }
+ 
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
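In the PLL_TIP_TIP branch of newviewGTRGAMMAPROT_AVX above, the kernel first precomputes, for each of the 23 protein tip states (20 amino acids plus a few ambiguity codes), the dot products of the 20-entry tip vector with every one of the 80 rows (4 gamma categories x 20 states) of the left and right transition matrices; the per-site loop then only performs table lookups keyed by the tip characters. A minimal scalar sketch of that precomputation, with a hypothetical function name in place of the intrinsic-based code:

    /* Scalar sketch of the umpX1/umpX2 precomputation in the TIP_TIP case.
     * tipVector holds 23*20 entries, left/right hold 80*20 entries each. */
    static void precompute_tip_tables(const double *tipVector,
                                      const double *left, const double *right,
                                      double umpX1[23 * 80], double umpX2[23 * 80])
    {
      int i, k, l;

      for (i = 0; i < 23; i++)
        {
          const double *v = &tipVector[20 * i];

          for (k = 0; k < 80; k++)
            {
              const double *ll = &left[k * 20];
              const double *rr = &right[k * 20];
              double s1 = 0.0, s2 = 0.0;

              for (l = 0; l < 20; l++)
                {
                  s1 += v[l] * ll[l];   /* dot product with left P-matrix row  */
                  s2 += v[l] * rr[l];   /* dot product with right P-matrix row */
                }
              umpX1[80 * i + k] = s1;
              umpX2[80 * i + k] = s2;
            }
        }
    }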
+
+
+
+void newviewGTRGAMMAPROT_AVX_GAPPED_SAVE(int tipCase,
+					 double *x1_start, double *x2_start, double *x3_start, double *extEV, double *tipVector,
+					 int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, 
+					 double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+					 unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
+					 double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn) 
+{
+  double	
+    *x1 = x1_start,
+    *x2 = x2_start,
+    *x3_ptr = x3_start,
+    *x2_ptr = x2_start,
+    *x1_ptr = x1_start,
+    *uX1, 
+    *uX2, 
+    *v, 
+    x1px2, 
+    *vl, 
+    *vr;
+  
+  int	
+    i, 
+    j, 
+    l, 
+    k, 
+    gapScaling = 0,
+    scale, 
+    addScale = 0;
+
+ 
+#ifndef GCC_VERSION
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif
+
+
+#if GCC_VERSION < 40500 && defined(__GNUC__)
+   __m256d
+    bitmask = _mm256_set_pd(0,0,0,-1);
+#else
+  __m256i
+    bitmask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+#endif 
+  
+  switch(tipCase) 
+    {
+    case PLL_TIP_TIP: 
+      {       
+    	  PLL_ALIGN_BEGIN double
+	  umpX1[1840] PLL_ALIGN_END,
+	  umpX2[1840] PLL_ALIGN_END;
+
+
+
+	for(i = 0; i < 23; i++) 
+	  {
+	    v = &(tipVector[20 * i]);
+	    
+	    for(k = 0; k < 80; k++) 
+	      {
+		double 
+		  *ll =  &left[k * 20],
+		  *rr =  &right[k * 20];
+		
+		__m256d 
+		  umpX1v = _mm256_setzero_pd(),
+		  umpX2v = _mm256_setzero_pd();
+		
+		for(l = 0; l < 20; l+=4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+#ifdef _FMA
+		    __m256d llv = _mm256_load_pd(&ll[l]);
+		    umpX1v = FMAMACC(umpX1v,vv,llv);
+		    __m256d rrv = _mm256_load_pd(&rr[l]);
+		    umpX2v = FMAMACC(umpX2v,vv,rrv);
+#else		    
+		    umpX1v = _mm256_add_pd(umpX1v,_mm256_mul_pd(vv,_mm256_load_pd(&ll[l])));
+		    umpX2v = _mm256_add_pd(umpX2v,_mm256_mul_pd(vv,_mm256_load_pd(&rr[l])));
+#endif
+		  }
+		
+		umpX1v = hadd3(umpX1v);
+		umpX2v = hadd3(umpX2v);
+		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
+	      } 
+	  }
+
+	
+	{	    
+	  uX1 = &umpX1[1760];
+	  uX2 = &umpX2[1760];
+	  
+	  for(j = 0; j < 4; j++) 
+	    {     	
+	      __m256d vv[5];  
+	      
+	      v = &x3_gapColumn[j * 20];
+	      
+	      vv[0] = _mm256_setzero_pd();
+	      vv[1] = _mm256_setzero_pd();
+	      vv[2] = _mm256_setzero_pd();
+	      vv[3] = _mm256_setzero_pd();
+	      vv[4] = _mm256_setzero_pd();
+	      
+	      for(k = 0; k < 20; k++) 
+		{			 
+		  x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+		  
+		  __m256d x1px2v = _mm256_set1_pd(x1px2);		    
+		  
+		  __m256d extEvv = _mm256_load_pd(&extEV[20 * k]);
+#ifdef _FMA
+		  vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
+#else
+		  vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		  _mm256_store_pd(&v[0],vv[0]);
+		  
+		  extEvv = _mm256_load_pd(&extEV[20 * k + 4]);
+#ifdef _FMA
+		  vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
+#else
+		  vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		  _mm256_store_pd(&v[4],vv[1]);
+		  
+		  extEvv = _mm256_load_pd(&extEV[20 * k + 8]);
+#ifdef _FMA
+		  vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
+#else
+		  vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		  _mm256_store_pd(&v[8],vv[2]);
+		  
+		  extEvv = _mm256_load_pd(&extEV[20 * k + 12]);
+#ifdef _FMA
+		  vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
+#else
+		  vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		  _mm256_store_pd(&v[12],vv[3]);
+		  
+		  extEvv = _mm256_load_pd(&extEV[20 * k + 16]);
+#ifdef _FMA
+		  vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
+#else
+		  vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		  _mm256_store_pd(&v[16],vv[4]);
+		} 
+	    } 
+	}
+
+	
+	for(i = 0; i < n; i++) 
+	  {
+	    if(!(x3_gap[i / 32] & mask32[i % 32]))
+	      {	    
+		uX1 = &umpX1[80 * tipX1[i]];
+		uX2 = &umpX2[80 * tipX2[i]];
+	   
+		for(j = 0; j < 4; j++) 
+		  {     	
+		    __m256d vv[5];  
+		    
+		    v = &x3_ptr[j * 20];
+			
+		    vv[0] = _mm256_setzero_pd();
+		    vv[1] = _mm256_setzero_pd();
+		    vv[2] = _mm256_setzero_pd();
+		    vv[3] = _mm256_setzero_pd();
+		    vv[4] = _mm256_setzero_pd();
+
+		    for(k = 0; k < 20; k++) 
+		      {			 
+			x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+			
+			__m256d x1px2v = _mm256_set1_pd(x1px2);		    
+			
+			__m256d extEvv = _mm256_load_pd(&extEV[20 * k]);
+#ifdef _FMA
+			vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
+#else
+			vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+			_mm256_store_pd(&v[0],vv[0]);
+			
+			extEvv = _mm256_load_pd(&extEV[20 * k + 4]);
+#ifdef _FMA
+			vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
+#else
+			vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+			_mm256_store_pd(&v[4],vv[1]);
+			
+			extEvv = _mm256_load_pd(&extEV[20 * k + 8]);
+#ifdef _FMA
+			vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
+#else
+			vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+			_mm256_store_pd(&v[8],vv[2]);
+			
+			extEvv = _mm256_load_pd(&extEV[20 * k + 12]);
+#ifdef _FMA
+			vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
+#else
+			vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+			_mm256_store_pd(&v[12],vv[3]);
+			
+			extEvv = _mm256_load_pd(&extEV[20 * k + 16]);
+#ifdef _FMA
+			vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
+#else
+			vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+			_mm256_store_pd(&v[16],vv[4]);
+		      } 
+		  }
+		x3_ptr += 80;		  
+	      }
+	  }
+      }
+      break;
+    case PLL_TIP_INNER: 
+      {
+    	  PLL_ALIGN_BEGIN double
+	  umpX1[1840] PLL_ALIGN_END,
+	  ump_x2[20] PLL_ALIGN_END;
+
+
+
+	for(i = 0; i < 23; i++) 
+	  {
+	    v = &(tipVector[20 * i]);
+
+	    for(k = 0; k < 80; k++) 
+	      {
+		__m256d umpX1v = _mm256_setzero_pd();
+		for(l = 0; l < 20; l+=4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+		    __m256d leftv = _mm256_load_pd(&left[k * 20 + l]);
+#ifdef _FMA
+		   
+		    umpX1v = FMAMACC(umpX1v, vv, leftv);
+#else
+		    umpX1v = _mm256_add_pd(umpX1v, _mm256_mul_pd(vv, leftv));
+#endif
+		  }
+		umpX1v = hadd3(umpX1v);
+		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+	      } 
+	  }
+
+	{	   
+	  uX1 = &umpX1[1760];
+	   	    
+	  for(k = 0; k < 4; k++) 
+	    {
+	      v = &(x2_gapColumn[k * 20]);
+		
+		for(l = 0; l < 20; l++) 
+		  {
+		    __m256d ump_x2v = _mm256_setzero_pd();
+		    		  
+		    __m256d vv = _mm256_load_pd(&v[0]);
+		    __m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+		    
+		    vv = _mm256_load_pd(&v[4]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+4]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[8]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+8]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[12]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+12]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[16]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+16]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+		    
+		    ump_x2v = hadd3(ump_x2v);
+		    _mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		  }
+		
+		v = &x3_gapColumn[20 * k];
+	
+		__m256d vv[5]; 
+
+		vv[0] = _mm256_setzero_pd();
+		vv[1] = _mm256_setzero_pd();
+		vv[2] = _mm256_setzero_pd();
+		vv[3] = _mm256_setzero_pd();
+		vv[4] = _mm256_setzero_pd();
+		
+		for(l = 0; l < 20; l++) 
+		  {
+		    x1px2 = uX1[k * 20 + l]	* ump_x2[l];
+		    __m256d x1px2v = _mm256_set1_pd(x1px2);	
+	    		 
+#ifdef _FMA
+		    __m256d ev = _mm256_load_pd(&extEV[l * 20 + 0]);
+		    vv[0] = FMAMACC(vv[0],x1px2v, ev);
+#else
+		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 0])));
+#endif
+		    _mm256_store_pd(&v[0],vv[0]);
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[l * 20 + 4]);
+		    vv[1] = FMAMACC(vv[1],x1px2v, ev);
+#else
+		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 4])));
+#endif
+		    _mm256_store_pd(&v[4],vv[1]);
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[l * 20 + 8]);
+		    vv[2] = FMAMACC(vv[2],x1px2v, ev);
+#else
+		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 8])));
+#endif
+		    _mm256_store_pd(&v[8],vv[2]);
+		    
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[l * 20 + 12]);
+		    vv[3] = FMAMACC(vv[3],x1px2v, ev);
+#else
+		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 12])));
+#endif
+		    _mm256_store_pd(&v[12],vv[3]);
+
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[l * 20 + 16]);
+		    vv[4] = FMAMACC(vv[4],x1px2v, ev);
+#else
+		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 16])));
+#endif
+		    _mm256_store_pd(&v[16],vv[4]);
+
+		  } 
+	      }
+	   
+	    v = x3_gapColumn;
+	    __m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);
+	    scale = 1;
+	    for(l = 0; scale && (l < 80); l += 4) 
+	      {
+		__m256d vv = _mm256_load_pd(&v[l]);
+		__m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+		vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+		if(_mm256_movemask_pd(vv_abs) != 15)
+		  scale = 0;
+	      }
+	    
+	    if(scale) 
+	      {		
+		__m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+		gapScaling = 1;
+
+		for(l = 0; l < 80; l += 4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+		    _mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+		  }	
+	      } 
+	}       
+	
+	for (i = 0; i < n; i++) 
+	  {	   
+	    if((x3_gap[i / 32] & mask32[i % 32]))
+	      {	       
+		if(gapScaling)
+		  {
+		    if(useFastScaling)
+		      addScale += wgt[i];
+		    else
+		      ex3[i]  += 1;
+		  }
+	      }
+	    else
+	      {		
+		uX1 = &umpX1[80 * tipX1[i]];
+		
+		if(x2_gap[i / 32] & mask32[i % 32])
+		  x2 = x2_gapColumn;
+		else
+		  {
+		    x2 = x2_ptr;
+		    x2_ptr += 80;
+		  }	      
+	    
+		for(k = 0; k < 4; k++) 
+		  {
+		    v = &(x2[k * 20]);
+		    
+		    for(l = 0; l < 20; l++) 
+		      {
+			__m256d ump_x2v = _mm256_setzero_pd();
+		    	
+			__m256d vv = _mm256_load_pd(&v[0]);
+			__m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
+#ifdef _FMA
+			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+			
+			vv = _mm256_load_pd(&v[4]);
+			rightv = _mm256_load_pd(&right[k*400+l*20+4]);
+#ifdef _FMA
+			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+			
+			vv = _mm256_load_pd(&v[8]);
+			rightv = _mm256_load_pd(&right[k*400+l*20+8]);
+#ifdef _FMA
+			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+			
+			vv = _mm256_load_pd(&v[12]);
+			rightv = _mm256_load_pd(&right[k*400+l*20+12]);
+#ifdef _FMA
+			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+			
+			vv = _mm256_load_pd(&v[16]);
+			rightv = _mm256_load_pd(&right[k*400+l*20+16]);
+#ifdef _FMA
+			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+			
+			ump_x2v = hadd3(ump_x2v);
+			_mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		      }
+		  
+		    
+		    v = &x3_ptr[k * 20];
+		    
+		    __m256d vv[5]; 
+		    
+		    vv[0] = _mm256_setzero_pd();
+		    vv[1] = _mm256_setzero_pd();
+		    vv[2] = _mm256_setzero_pd();
+		    vv[3] = _mm256_setzero_pd();
+		    vv[4] = _mm256_setzero_pd();
+		    
+		    for(l = 0; l < 20; l++) 
+		      {
+			x1px2 = uX1[k * 20 + l]	* ump_x2[l];
+			__m256d x1px2v = _mm256_set1_pd(x1px2);	
+			
+#ifdef _FMA
+			__m256d ev = _mm256_load_pd(&extEV[l * 20 + 0]);
+			vv[0] = FMAMACC(vv[0],x1px2v, ev);
+#else
+			vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 0])));
+#endif
+			_mm256_store_pd(&v[0],vv[0]);
+			
+#ifdef _FMA
+			ev = _mm256_load_pd(&extEV[l * 20 + 4]);
+			vv[1] = FMAMACC(vv[1],x1px2v, ev);
+#else
+			vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 4])));
+#endif
+			_mm256_store_pd(&v[4],vv[1]);
+			
+#ifdef _FMA
+			ev = _mm256_load_pd(&extEV[l * 20 + 8]);
+			vv[2] = FMAMACC(vv[2],x1px2v, ev);
+#else
+			vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 8])));
+#endif
+			_mm256_store_pd(&v[8],vv[2]);
+			
+#ifdef _FMA
+			ev = _mm256_load_pd(&extEV[l * 20 + 12]);
+			vv[3] = FMAMACC(vv[3],x1px2v, ev);
+#else
+			vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 12])));
+#endif
+			_mm256_store_pd(&v[12],vv[3]);
+			
+			
+#ifdef _FMA
+			ev = _mm256_load_pd(&extEV[l * 20 + 16]);
+			vv[4] = FMAMACC(vv[4],x1px2v, ev);
+#else
+			vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 16])));
+#endif
+			_mm256_store_pd(&v[16],vv[4]);
+			
+		      } 
+		  }
+		
+		v = x3_ptr;
+		__m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);
+		scale = 1;
+		for(l = 0; scale && (l < 80); l += 4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+		    __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+		    vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+		    if(_mm256_movemask_pd(vv_abs) != 15)
+		      scale = 0;
+		  }
+	    
+		if(scale) 
+		  {		
+		    __m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+		    for(l = 0; l < 80; l += 4) 
+		      {
+			__m256d vv = _mm256_load_pd(&v[l]);
+			_mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+		      }
+		    if(useFastScaling)
+		      addScale += wgt[i];				
+		    else
+		      ex3[i] += 1;
+		  }	      
+		x3_ptr += 80;
+	      }
+	  }
+      }
+      break;
+    case PLL_INNER_INNER:    	  
+      for(k = 0; k < 4; k++) 
+	{
+	  vl = &(x1_gapColumn[20 * k]);
+	  vr = &(x2_gapColumn[20 * k]);
+	  v  = &(x3_gapColumn[20 * k]);	      	   
+
+	  __m256d vv[5]; 
+	  
+	  vv[0] = _mm256_setzero_pd();
+	  vv[1] = _mm256_setzero_pd();
+	  vv[2] = _mm256_setzero_pd();
+	  vv[3] = _mm256_setzero_pd();
+	  vv[4] = _mm256_setzero_pd();
+	  
+	  for(l = 0; l < 20; l++) 
+	    {		  
+	      __m256d al = _mm256_setzero_pd();
+	      __m256d ar = _mm256_setzero_pd();
+	      
+	      __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+	      __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+	      __m256d vlv = _mm256_load_pd(&vl[0]);
+	      __m256d vrv = _mm256_load_pd(&vr[0]);
+	      
+#ifdef _FMA
+	      
+	      al = FMAMACC(al, vlv, leftv);
+	      ar = FMAMACC(ar, vrv, rightv);
+#else
+	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+#endif
+	      
+	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+	      vlv = _mm256_load_pd(&vl[4]);
+	      vrv = _mm256_load_pd(&vr[4]);
+#ifdef _FMA
+	      
+	      al = FMAMACC(al, vlv, leftv);
+	      ar = FMAMACC(ar, vrv, rightv);
+#else
+	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+	      
+	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+	      vlv = _mm256_load_pd(&vl[8]);
+	      vrv = _mm256_load_pd(&vr[8]);
+#ifdef _FMA
+	      
+	      al = FMAMACC(al, vlv, leftv);
+	      ar = FMAMACC(ar, vrv, rightv);
+#else
+	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+	      
+	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+	      vlv = _mm256_load_pd(&vl[12]);
+	      vrv = _mm256_load_pd(&vr[12]);
+#ifdef _FMA
+	      
+	      al = FMAMACC(al, vlv, leftv);
+	      ar = FMAMACC(ar, vrv, rightv);
+#else
+	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+	      
+	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+	      vlv = _mm256_load_pd(&vl[16]);
+	      vrv = _mm256_load_pd(&vr[16]);
+	      
+#ifdef _FMA		    
+	      al = FMAMACC(al, vlv, leftv);
+	      ar = FMAMACC(ar, vrv, rightv);
+#else
+	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+	      
+	      /**************************************************************************************************************/
+	      
+	      al = hadd3(al);
+	      ar = hadd3(ar);
+	      al = _mm256_mul_pd(ar,al);
+	      
+	      /************************************************************************************************************/
+#ifdef _FMA		    
+	      __m256d ev =  _mm256_load_pd(&extEV[20 * l + 0]);
+	      vv[0] = FMAMACC(vv[0], al, ev);		 
+#else
+	      vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
+#endif
+	      _mm256_store_pd(&v[0],vv[0]);
+	      
+#ifdef _FMA		    
+	      ev =  _mm256_load_pd(&extEV[20 * l + 4]);
+	      vv[1] = FMAMACC(vv[1], al, ev);		 
+#else
+	      vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 4])));		  		 
+#endif
+	      _mm256_store_pd(&v[4],vv[1]);
+	      
+#ifdef _FMA		    
+	      ev =  _mm256_load_pd(&extEV[20 * l + 8]);
+	      vv[2] = FMAMACC(vv[2], al, ev);		 
+#else
+	      vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 8])));		  		 
+#endif
+	      _mm256_store_pd(&v[8],vv[2]);
+	      
+#ifdef _FMA		    
+	      ev =  _mm256_load_pd(&extEV[20 * l + 12]);
+	      vv[3] = FMAMACC(vv[3], al, ev);		 
+#else
+	      vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 12])));		  		 
+#endif
+	      _mm256_store_pd(&v[12],vv[3]);
+	      
+#ifdef _FMA		    
+	      ev =  _mm256_load_pd(&extEV[20 * l + 16]);
+	      vv[4] = FMAMACC(vv[4], al, ev);		 
+#else
+	      vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 16])));			 	  
+#endif
+	      _mm256_store_pd(&v[16],vv[4]);		 
+	    } 
+	}
+	
+      v = x3_gapColumn;
+      scale = 1;
+      __m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);	 
+      
+      for(l = 0; scale && (l < 80); l += 4) 
+	{
+	  __m256d vv = _mm256_load_pd(&v[l]);
+	  __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+	  vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+	  if(_mm256_movemask_pd(vv_abs) != 15)
+	    scale = 0;	     
+	}
+
+      if(scale) 
+	{		     	      
+	  __m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+	  gapScaling = 1;
+
+	  for(l = 0; l < 80; l += 4) 
+	    {
+	      __m256d vv = _mm256_load_pd(&v[l]);
+	      _mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+	    }
+	  
+	} 
+   
+     
+
+      for(i = 0; i < n; i++) 
+	{   
+	  
+	  if(x3_gap[i / 32] & mask32[i % 32])
+	    {	     
+	      if(gapScaling)
+		{
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i]  += 1; 	       
+		}
+	    }
+	  else
+	    {
+	      if(x1_gap[i / 32] & mask32[i % 32])
+		x1 = x1_gapColumn;
+	      else
+		{
+		  x1 = x1_ptr;
+		  x1_ptr += 80;
+		}
+
+	      if(x2_gap[i / 32] & mask32[i % 32])
+		x2 = x2_gapColumn;
+	      else
+		{
+		  x2 = x2_ptr;
+		  x2_ptr += 80;
+		}	   
+	  
+	      for(k = 0; k < 4; k++) 
+		{
+		  vl = &(x1[20 * k]);
+		  vr = &(x2[20 * k]);
+		  v  = &(x3_ptr[20 * k]);	      	   
+		  
+		  __m256d vv[5]; 
+		  
+		  vv[0] = _mm256_setzero_pd();
+		  vv[1] = _mm256_setzero_pd();
+		  vv[2] = _mm256_setzero_pd();
+		  vv[3] = _mm256_setzero_pd();
+		  vv[4] = _mm256_setzero_pd();
+		  
+		  for(l = 0; l < 20; l++) 
+		    {		  
+		      __m256d al = _mm256_setzero_pd();
+		      __m256d ar = _mm256_setzero_pd();
+		      
+		      __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+		      __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+		      __m256d vlv = _mm256_load_pd(&vl[0]);
+		      __m256d vrv = _mm256_load_pd(&vr[0]);
+		      
+#ifdef _FMA
+		      
+		      al = FMAMACC(al, vlv, leftv);
+		      ar = FMAMACC(ar, vrv, rightv);
+#else
+		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+#endif
+		      
+		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+		      vlv = _mm256_load_pd(&vl[4]);
+		      vrv = _mm256_load_pd(&vr[4]);
+#ifdef _FMA
+		      
+		      al = FMAMACC(al, vlv, leftv);
+		      ar = FMAMACC(ar, vrv, rightv);
+#else
+		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+		      
+		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+		      vlv = _mm256_load_pd(&vl[8]);
+		      vrv = _mm256_load_pd(&vr[8]);
+#ifdef _FMA
+		      
+		      al = FMAMACC(al, vlv, leftv);
+		      ar = FMAMACC(ar, vrv, rightv);
+#else
+		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+		      
+		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+		      vlv = _mm256_load_pd(&vl[12]);
+		      vrv = _mm256_load_pd(&vr[12]);
+#ifdef _FMA
+		      
+		      al = FMAMACC(al, vlv, leftv);
+		      ar = FMAMACC(ar, vrv, rightv);
+#else
+		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+		      
+		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+		      vlv = _mm256_load_pd(&vl[16]);
+		      vrv = _mm256_load_pd(&vr[16]);
+		      
+#ifdef _FMA		    
+		      al = FMAMACC(al, vlv, leftv);
+		      ar = FMAMACC(ar, vrv, rightv);
+#else
+		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+		      
+		      /**************************************************************************************************************/
+		      
+		      al = hadd3(al);
+		      ar = hadd3(ar);
+		      al = _mm256_mul_pd(ar,al);
+		      
+		      /************************************************************************************************************/
+#ifdef _FMA		    
+		      __m256d ev =  _mm256_load_pd(&extEV[20 * l + 0]);
+		      vv[0] = FMAMACC(vv[0], al, ev);		 
+#else
+		      vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
+#endif
+		      _mm256_store_pd(&v[0],vv[0]);
+		      
+#ifdef _FMA		    
+		      ev =  _mm256_load_pd(&extEV[20 * l + 4]);
+		      vv[1] = FMAMACC(vv[1], al, ev);		 
+#else
+		      vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 4])));		  		 
+#endif
+		      _mm256_store_pd(&v[4],vv[1]);
+		      
+#ifdef _FMA		    
+		      ev =  _mm256_load_pd(&extEV[20 * l + 8]);
+		      vv[2] = FMAMACC(vv[2], al, ev);		 
+#else
+		      vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 8])));		  		 
+#endif
+		      _mm256_store_pd(&v[8],vv[2]);
+		      
+#ifdef _FMA		    
+		      ev =  _mm256_load_pd(&extEV[20 * l + 12]);
+		      vv[3] = FMAMACC(vv[3], al, ev);		 
+#else
+		      vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 12])));		  		 
+#endif
+		      _mm256_store_pd(&v[12],vv[3]);
+		      
+#ifdef _FMA		    
+		      ev =  _mm256_load_pd(&extEV[20 * l + 16]);
+		      vv[4] = FMAMACC(vv[4], al, ev);		 
+#else
+		      vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 16])));			 	  
+#endif
+		      _mm256_store_pd(&v[16],vv[4]);		 
+		    }
+		}
+	      
+	      v = x3_ptr;
+	      scale = 1;
+	      
+	      __m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);	 
+	      
+	      for(l = 0; scale && (l < 80); l += 4) 
+		{
+		  __m256d vv = _mm256_load_pd(&v[l]);
+		  __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+		  vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+		  if(_mm256_movemask_pd(vv_abs) != 15)
+		    scale = 0;	     
+		}
+	      
+	      if(scale) 
+		{		     	      
+		  __m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+		  for(l = 0; l < 80; l += 4) 
+		    {
+		      __m256d vv = _mm256_load_pd(&v[l]);
+		      _mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+		    }
+		  if(useFastScaling)
+		    addScale += wgt[i];					
+		  else
+		    ex3[i] += 1;
+		}  
+	      x3_ptr += 80;
+	    }
+	}
+      break;
+    default:
+      assert(0);
+    }
+ 
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
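newviewGTRGAMMAPROT_AVX_GAPPED_SAVE above performs the same arithmetic as the plain kernel, but sites flagged as all-gap share a single precomputed column (x1_gapColumn/x2_gapColumn/x3_gapColumn), per-site bits in x1_gap/x2_gap/x3_gap mark which sites are gapped, and only non-gapped sites advance the packed x1_ptr/x2_ptr/x3_ptr pointers by 80 doubles. A minimal sketch of that bookkeeping, assuming mask32[b] is the single-bit mask for position b, here written as (1u << b):

    /* Sketch of the gap-column selection used by the *_GAPPED_SAVE kernels. */
    static const double *select_column(const unsigned int *gap_bits, int site,
                                       const double *gap_column,
                                       const double **packed_ptr)
    {
      if (gap_bits[site / 32] & (1u << (site % 32)))
        return gap_column;               /* all-gap site: shared precomputed column      */

      {
        const double *col = *packed_ptr; /* regular site: consume the next packed column */
        *packed_ptr += 80;               /* 80 doubles per site (4 categories x 20 states) */
        return col;
      }
    }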
diff --git a/pll/bipartitionList.c b/pll/bipartitionList.c
new file mode 100644
index 0000000..44c6888
--- /dev/null
+++ b/pll/bipartitionList.c
@@ -0,0 +1,434 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file bipartitionList.c
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32  
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>  
+#endif
+
+#include <limits.h>
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+
+static pllBipartitionEntry *initEntry(void);
+static void getxnodeBips (nodeptr p);
+static void newviewBipartitions(unsigned int **bitVectors, 
+                                nodeptr p, 
+                                int numsp, 
+                                unsigned int vectorLength, 
+                                int processID);
+
+static void insertHashRF(unsigned int *bitVector, 
+                         pllHashTable *h, 
+                         unsigned int vectorLength, 
+                         int treeNumber, 
+                         int treeVectorLength, 
+                         hashNumberType position, 
+                         int support, 
+                         pllBoolean computeWRF);
+
+extern const unsigned int mask32[32];
+
+
+static void getxnodeBips (nodeptr p)
+{
+  nodeptr  s;
+
+  if ((s = p->next)->xBips || (s = s->next)->xBips)
+    {
+      p->xBips = s->xBips;
+      s->xBips = 0;
+    }
+
+  assert(p->xBips);
+}
+
+
+static pllBipartitionEntry *initEntry(void)
+{
+  pllBipartitionEntry * e = (pllBipartitionEntry *)rax_malloc(sizeof(pllBipartitionEntry));
+
+  e->bitVector     = (unsigned int*)NULL;
+  e->treeVector    = (unsigned int*)NULL;
+  e->supportVector = (int*)NULL;
+  e->bipNumber  = 0;
+  e->bipNumber2 = 0;
+  e->supportFromTreeset[0] = 0;
+  e->supportFromTreeset[1] = 0;
+  e->next       = (pllBipartitionEntry *)NULL;
+
+  return e;
+} 
+
+void cleanupHashTable(pllHashTable *h, int state)
+{
+  unsigned int
+    k,
+    entryCount = 0,
+    removeCount = 0;
+ 
+  assert(state == 1 || state == 0);
+
+  for(k = 0, entryCount = 0; k < h->size; k++)       
+    { 
+      pllHashItem * start     = NULL;
+      pllHashItem * lastValid = NULL;
+      
+      pllHashItem * hitem = h->Items[k];
+      while (hitem)
+       {                           
+         pllBipartitionEntry *e = (pllBipartitionEntry *)(hitem->data);
+         if(state == 0)
+           {
+             e->treeVector[0] = e->treeVector[0] & 2;      
+             assert(!(e->treeVector[0] & 1));
+           }
+         else
+           {
+             e->treeVector[0] = e->treeVector[0] & 1;
+             assert(!(e->treeVector[0] & 2));
+           }
+         
+         if(e->treeVector[0] != 0)
+           {
+             if(!start)
+               start = hitem;
+             lastValid = hitem;
+             hitem = hitem->next;
+           }         
+         else
+           {
+             pllHashItem *tmp = hitem;
+             pllBipartitionEntry *remove = e;
+             hitem = hitem->next;
+             
+             removeCount++;
+
+             if(lastValid) lastValid->next = hitem;
+
+             if(remove->bitVector)     rax_free(remove->bitVector);
+             if(remove->treeVector)    rax_free(remove->treeVector);
+             if(remove->supportVector) rax_free(remove->supportVector);
+             rax_free(remove);              
+             rax_free(tmp);
+           }
+         entryCount++;
+       }
+
+      if(!start)
+        {
+          assert(!lastValid);
+          h->Items[k] = NULL;
+        }
+      else
+        {
+          h->Items[k] = start;
+        }            
+    }
+
+  assert(entryCount ==  h->entries);
+  h->entries-= removeCount;
+}
+
+
+
+
+
+
+
+
+
+
+
+unsigned int **initBitVector(int mxtips, unsigned int *vectorLength)
+{
+  unsigned int 
+    **bitVectors = (unsigned int **)rax_malloc(sizeof(unsigned int*) * 2 * (size_t)mxtips);
+  
+  int 
+    i;
+
+  if(mxtips % PLL_MASK_LENGTH == 0)
+    *vectorLength = mxtips / PLL_MASK_LENGTH;
+  else
+    *vectorLength = 1 + (mxtips / PLL_MASK_LENGTH); 
+  
+  for(i = 1; i <= mxtips; i++)
+    {
+      bitVectors[i] = (unsigned int *)rax_calloc((size_t)(*vectorLength), sizeof(unsigned int));
+      assert(bitVectors[i]);
+      bitVectors[i][(i - 1) / PLL_MASK_LENGTH] |= mask32[(i - 1) % PLL_MASK_LENGTH];
+    }
+  
+  for(i = mxtips + 1; i < 2 * mxtips; i++) 
+    {
+      bitVectors[i] = (unsigned int *)rax_malloc(sizeof(unsigned int) * (size_t)(*vectorLength));
+      assert(bitVectors[i]);
+    }
+
+  return bitVectors;
+}
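For illustration: assuming PLL_MASK_LENGTH is 32 (as the mask32 table suggests), initBitVector with mxtips = 5 gives vectorLength = 1, allocates zeroed single-word vectors for tips 1..5 with exactly bit (i - 1) set (binary 00001, 00010, 00100, 01000, 10000), and leaves the inner-node vectors at indices 6..9 uninitialized; they are filled later by newviewBipartitions.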
+
+void freeBitVectors(unsigned int **v, int n)
+{
+  int i;
+
+  for(i = 1; i < n; i++)
+    rax_free(v[i]);
+}
+
+
+static void newviewBipartitions(unsigned int **bitVectors, 
+                                nodeptr p, 
+                                int numsp, 
+                                unsigned int vectorLength, 
+                                int processID)
+{
+  
+  if(isTip(p->number, numsp))
+    return;
+  {
+    nodeptr 
+      q = p->next->back, 
+      r = p->next->next->back;
+    
+    
+    
+    unsigned int       
+      *vector = bitVectors[p->number],
+      *left  = bitVectors[q->number],
+      *right = bitVectors[r->number];
+    unsigned int i;      
+    
+    assert(processID == 0);
+    
+
+    while(!p->xBips)
+      { 
+        if(!p->xBips)
+          getxnodeBips(p);
+      }
+
+    p->hash = q->hash ^ r->hash;
+
+    if(isTip(q->number, numsp) && isTip(r->number, numsp))
+      {         
+        for(i = 0; i < vectorLength; i++)
+          vector[i] = left[i] | right[i];               
+      }
+    else
+      { 
+        if(isTip(q->number, numsp) || isTip(r->number, numsp))
+          {
+            if(isTip(r->number, numsp))
+              { 
+                nodeptr tmp = r;
+                r = q;
+                q = tmp;
+              }    
+                    
+            while(!r->xBips)
+              {
+                if(!r->xBips)
+                  newviewBipartitions(bitVectors, r, numsp, vectorLength, processID);
+              }    
+
+            for(i = 0; i < vectorLength; i++)
+              vector[i] = left[i] | right[i];            
+          }
+        else
+          {         
+            while((!r->xBips) || (!q->xBips))
+              {
+                if(!q->xBips)
+                  newviewBipartitions(bitVectors, q, numsp, vectorLength, processID);
+                if(!r->xBips)
+                  newviewBipartitions(bitVectors, r, numsp, vectorLength, processID);
+              }                                    
+
+            for(i = 0; i < vectorLength; i++)
+              vector[i] = left[i] | right[i];    
+          }
+
+      }     
+  }     
+}
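newviewBipartitions computes, for an inner node p, the set of tips below it as the bitwise OR of its two children's bit vectors (recursing first whenever a child's vector is not yet valid), and derives p->hash as the XOR of the children's hashes so that identical bipartitions from different trees land in the same hash bucket. The core per-word step, written out as a standalone sketch:

    /* Sketch: an inner node's bipartition vector is the OR of its children's. */
    static void or_bitvectors(unsigned int *dst,
                              const unsigned int *left, const unsigned int *right,
                              unsigned int vectorLength)
    {
      unsigned int i;
      for (i = 0; i < vectorLength; i++)
        dst[i] = left[i] | right[i];
    }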
+
+
+
+
+static void insertHashRF(unsigned int *bitVector, 
+                         pllHashTable *h, 
+                         unsigned int vectorLength, 
+                         int treeNumber, 
+                         int treeVectorLength, 
+                         hashNumberType position, 
+                         int support, 
+                         pllBoolean computeWRF)
+{
+  pllBipartitionEntry * e;
+  pllHashItem * hitem;
+
+  if(h->Items[position] != NULL)
+    {
+      for (hitem = h->Items[position]; hitem; hitem = hitem->next)
+        { 
+          e = (pllBipartitionEntry *)(hitem->data);
+          
+          if (!memcmp(bitVector, e->bitVector, vectorLength * sizeof(unsigned int)))
+            {
+              e->treeVector[treeNumber / PLL_MASK_LENGTH] |= mask32[treeNumber % PLL_MASK_LENGTH];
+              if(computeWRF)
+                {
+                  e->supportVector[treeNumber] = support;
+                  assert(0 <= treeNumber && treeNumber < treeVectorLength * PLL_MASK_LENGTH);
+                }
+              return;
+            }
+        }
+    }
+  e = initEntry(); 
+       
+  rax_posix_memalign ((void **)&(e->bitVector), PLL_BYTE_ALIGNMENT, (size_t)vectorLength * sizeof(unsigned int));
+  memset(e->bitVector, 0, vectorLength * sizeof(unsigned int));
+
+  e->treeVector = (unsigned int*)rax_calloc((size_t)treeVectorLength, sizeof(unsigned int));
+  if(computeWRF)
+    e->supportVector = (int*)rax_calloc((size_t)treeVectorLength * PLL_MASK_LENGTH, sizeof(int));
+
+  e->treeVector[treeNumber / PLL_MASK_LENGTH] |= mask32[treeNumber % PLL_MASK_LENGTH];
+  if(computeWRF)
+    {
+      e->supportVector[treeNumber] = support;
+     
+      assert(0 <= treeNumber && treeNumber < treeVectorLength * PLL_MASK_LENGTH);
+    }
+
+  memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength);
+  
+  pllHashAdd (h, position, NULL, (void *)e);
+}
+
+
+
+void bitVectorInitravSpecial(unsigned int **bitVectors, nodeptr p, int numsp, unsigned int vectorLength, pllHashTable *h, int treeNumber, int function, branchInfo *bInf, 
+                             int *countBranches, int treeVectorLength, pllBoolean traverseOnly, pllBoolean computeWRF, int processID)
+{
+  if(isTip(p->number, numsp))
+    return;
+  else
+    {
+      nodeptr 
+        q = p->next;          
+
+      do 
+        {
+          bitVectorInitravSpecial(bitVectors, q->back, numsp, vectorLength, h, treeNumber, function, bInf, countBranches, treeVectorLength, traverseOnly, computeWRF, processID);
+          q = q->next;
+        }
+      while(q != p);
+           
+      newviewBipartitions(bitVectors, p, numsp, vectorLength, processID);
+      
+      assert(p->xBips);
+
+      assert(!traverseOnly);     
+
+      if(!(isTip(p->back->number, numsp)))
+        {
+          unsigned int 
+            *toInsert  = bitVectors[p->number];
+          
+          hashNumberType 
+            position = p->hash % h->size;
+         
+          assert(!(toInsert[0] & 1));
+          assert(!computeWRF);
+          
+          switch(function)
+            {        
+            case PLL_BIPARTITIONS_RF:        
+              insertHashRF(toInsert, h, vectorLength, treeNumber, treeVectorLength, position, 0, computeWRF);
+              *countBranches =  *countBranches + 1;
+              break;
+            default:
+              assert(0);
+            }             
+        }
+      
+    }
+}
+
+double convergenceCriterion(pllHashTable *h, int mxtips)
+{
+  int      
+    rf = 0; 
+
+  unsigned int 
+    k = 0, 
+    entryCount = 0;
+  
+  double    
+    rrf;  
+
+  pllHashItem * hitem;
+
+  for(k = 0, entryCount = 0; k < h->size; k++)          
+    {      
+      for (hitem = h->Items[k]; hitem; hitem = hitem->next)
+       {
+         pllBipartitionEntry *e = hitem->data;
+         unsigned int *vector = e->treeVector;          
+
+         if(((vector[0] & 1) > 0) + ((vector[0] & 2) > 0) == 1)
+           rf++;        
+          
+         entryCount++;
+         e = e->next;
+       }
+    }
+
+  assert(entryCount == h->entries);  
+  rrf = (double)rf/((double)(2 * (mxtips - 3)));  
+  return rrf;
+}
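convergenceCriterion walks the hash table and counts bipartitions whose treeVector marks them as present in exactly one of the two compared trees (bit 0 or bit 1, but not both); dividing by 2 * (mxtips - 3), the total number of inner branches across two unrooted binary trees, yields the relative Robinson-Foulds distance. For example, with mxtips = 25 taxa there are 2 * (25 - 3) = 44 comparable bipartitions, so rf = 11 bipartitions unique to one tree gives rrf = 11 / 44 = 0.25.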
diff --git a/pll/cycle.h b/pll/cycle.h
new file mode 100644
index 0000000..889932a
--- /dev/null
+++ b/pll/cycle.h
@@ -0,0 +1,516 @@
+/*
+ * Copyright (c) 2003, 2007-8 Matteo Frigo
+ * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+
+/* machine-dependent cycle counters code. Needs to be inlined. */
+
+/***************************************************************************/
+/* To use the cycle counters in your code, simply #include "cycle.h" (this
+   file), and then use the functions/macros:
+
+                 ticks getticks(void);
+
+   ticks is an opaque typedef defined below, representing the current time.
+   You extract the elapsed time between two calls to getticks() via:
+
+                 double elapsed(ticks t1, ticks t0);
+
+   which returns a double-precision variable in arbitrary units.  You
+   are not expected to convert this into human units like seconds; it
+   is intended only for *comparisons* of time intervals.
+
+   (In order to use some of the OS-dependent timer routines like
+   Solaris' gethrtime, you need to paste the autoconf snippet below
+   into your configure.ac file and #include "config.h" before cycle.h,
+   or define the relevant macros manually if you are not using autoconf.)
+*/
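The comment above describes the whole public surface of this header: an opaque ticks type, getticks() to sample the machine counter, and elapsed() to difference two samples in arbitrary units. A minimal usage sketch under those assumptions (do_work() is a hypothetical function being timed, and HAVE_TICK_COUNTER is the macro this header defines when a counter implementation is available):

    /* Hypothetical use of the cycle.h interface described above. */
    #include <stdio.h>
    #include "cycle.h"

    void do_work(void);   /* whatever is being measured (assumed to exist elsewhere) */

    int main(void)
    {
    #ifdef HAVE_TICK_COUNTER
      ticks t0 = getticks();
      do_work();
      ticks t1 = getticks();
      /* arbitrary units: only meaningful when comparing intervals */
      printf("elapsed: %f\n", elapsed(t1, t0));
    #endif
      return 0;
    }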
+
+/***************************************************************************/
+/* This file uses macros like HAVE_GETHRTIME that are assumed to be
+   defined according to whether the corresponding function/type/header
+   is available on your system.  The necessary macros are most
+   conveniently defined if you are using GNU autoconf, via the tests:
+   
+   dnl ---------------------------------------------------------------------
+
+   AC_C_INLINE
+   AC_HEADER_TIME
+   AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
+
+   AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif])
+
+   AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time])
+
+   dnl Cray UNICOS _rtc() (real-time clock) intrinsic
+   AC_MSG_CHECKING([for _rtc intrinsic])
+   rtc_ok=yes
+   AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H
+#include <intrinsics.h>
+#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no])
+   AC_MSG_RESULT($rtc_ok)
+
+   dnl ---------------------------------------------------------------------
+*/
+
+/***************************************************************************/
+
+#ifdef TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+#else
+# ifdef HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+
+
+
+
+#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \
+{									  \
+     return (double)t1 - (double)t0;					  \
+}
+
+/*----------------------------------------------------------------*/
+/* Solaris */
+#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER)
+typedef hrtime_t ticks;
+
+#define getticks gethrtime
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* AIX v. 4+ routines to read the real-time clock or time-base register */
+#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER)
+typedef timebasestruct_t ticks;
+
+static __inline ticks getticks(void)
+{
+     ticks t;
+     read_real_time(&t, TIMEBASE_SZ);
+     return t;
+}
+
+static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */
+{
+     time_base_to_time(&t1, TIMEBASE_SZ);
+     time_base_to_time(&t0, TIMEBASE_SZ);
+     return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + 
+	     ((double)t1.tb_low - (double)t0.tb_low));
+}
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PowerPC ``cycle'' counter using the time base register.
+ */
+#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__))))  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned int tbl, tbu0, tbu1;
+
+     do {
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+	  __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+     } while (tbu0 != tbu1);
+
+     return (((unsigned long long)tbu0) << 32) | tbl;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* MacOS/Mach (Darwin) time-base register interface (which, unlike UpTime
+   from Carbon, requires no additional libraries to be linked). */
+#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER)
+#include <mach/mach_time.h>
+typedef uint64_t ticks;
+#define getticks mach_absolute_time
+INLINE_ELAPSED(__inline__)
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * Pentium cycle counter 
+ */
+#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("rdtsc": "=A" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#endif
+
+/* Visual C++ -- thanks to Morten Nissov for his help with this */
+#if defined(_MSC_VER) && _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER)
+#include <windows.h>
+typedef LARGE_INTEGER ticks;
+#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */
+
+static __inline ticks getticks(void)
+{
+     ticks retval;
+
+     __asm {
+	  RDTSC
+	  mov retval.HighPart, edx
+	  mov retval.LowPart, eax
+     }
+     return retval;
+}
+
+static __inline double elapsed(ticks t1, ticks t0)
+{  
+     return (double)t1.QuadPart - (double)t0.QuadPart;
+}  
+
+#define HAVE_TICK_COUNTER
+#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * X86-64 cycle counter
+ */
+#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned a, d; 
+     __asm volatile("rdtsc" : "=a" (a), "=d" (d)); 
+     return ((ticks)a) | (((ticks)d) << 32); 
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori.
+   NOTE: this code will fail to link unless you use the -Masmkeyword compiler
+   option (grrr). */
+#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) 
+typedef unsigned long long ticks;
+static ticks getticks(void)
+{
+    asm(" rdtsc; shl    $0x20,%rdx; mov    %eax,%eax; or     %rdx,%rax;    ");
+}
+INLINE_ELAPSED(__inline__)
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Visual C++, courtesy of Dirk Michaelis */
+#if defined(_MSC_VER) && _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER)
+
+#include <intrin.h>
+#pragma intrinsic(__rdtsc)
+typedef unsigned __int64 ticks;
+#define getticks __rdtsc
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * IA64 cycle counter
+ */
+
+/* intel's icc/ecc compiler */
+#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+#include <ia64intrin.h>
+
+static __inline__ ticks getticks(void)
+{
+     return __getReg(_IA64_REG_AR_ITC);
+}
+ 
+INLINE_ELAPSED(__inline__)
+ 
+#define HAVE_TICK_COUNTER
+#endif
+
+/* gcc */
+#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */
+#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER)
+#include <machine/sys/inline.h>
+typedef unsigned long ticks;
+
+static __inline ticks getticks(void)
+{
+     ticks ret;
+
+     ret = _Asm_mov_from_ar (_AREG_ITC);
+     return ret;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Microsoft Visual C++ */
+#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned __int64 ticks;
+
+#  ifdef __cplusplus
+extern "C"
+#  endif
+ticks __getReg(int whichReg);
+#pragma intrinsic(__getReg)
+
+static __inline ticks getticks(void)
+{
+     volatile ticks temp;
+     temp = __getReg(3116);
+     return temp;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PA-RISC cycle counter 
+ */
+#if (defined(__hppa__) || defined(__hppa)) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+#  ifdef __GNUC__
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("mfctl 16, %0": "=r" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+#  else
+#  include <machine/inline.h>
+static __inline unsigned long getticks(void)
+{
+     register ticks ret;
+     _MFCTL(16, ret);
+     return ret;
+}
+#  endif
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* S390, courtesy of James Treacy */
+#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks cycles;
+     __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc");
+     return cycles;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER)
+/*
+ * The 32-bit cycle counter on alpha overflows pretty quickly, 
+ * unfortunately.  A 1GHz machine overflows in 4 seconds.
+ */
+typedef unsigned int ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned long cc;
+     __asm__ __volatile__ ("rpcc %0" : "=r"(cc));
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+     __asm__ __volatile__("rd %%tick, %0" : "=r" (ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER)
+#  include <c_asm.h>
+typedef unsigned int ticks;
+
+static __inline ticks getticks(void)
+{
+     unsigned long cc;
+     cc = asm("rpcc %v0");
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+/* SGI/Irix */
+#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER)
+typedef struct timespec ticks;
+
+static __inline ticks getticks(void)
+{
+     struct timespec t;
+     clock_gettime(CLOCK_SGI_CYCLE, &t);
+     return t;
+}
+
+static __inline double elapsed(ticks t1, ticks t0)
+{
+     return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 +
+	  ((double)t1.tv_nsec - (double)t0.tv_nsec);
+}
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* Cray UNICOS _rtc() intrinsic function */
+#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER)
+#ifdef HAVE_INTRINSICS_H
+#  include <intrinsics.h>
+#endif
+
+typedef long long ticks;
+
+#define getticks _rtc
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* MIPS ZBus */
+#ifdef HAVE_MIPS_ZBUS_TIMER
+#if defined(__mips__) && !defined(HAVE_TICK_COUNTER)
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+typedef uint64_t ticks;
+
+static __inline ticks getticks(void)
+{
+  static uint64_t* addr = 0;
+
+  if (addr == 0)
+  {
+    uint32_t rq_addr = 0x10030000;
+    int fd;
+    int pgsize;
+
+    pgsize = getpagesize();
+    fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0);
+    if (fd < 0) {
+      perror("open");
+      return 0;
+    }
+    addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr);
+    close(fd);
+    if (addr == (uint64_t *)-1) {
+      perror("mmap");
+      return 0;
+    }
+  }
+
+  return *addr;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+#endif /* HAVE_MIPS_ZBUS_TIMER */
diff --git a/pll/errcodes.h b/pll/errcodes.h
new file mode 100644
index 0000000..ce81e68
--- /dev/null
+++ b/pll/errcodes.h
@@ -0,0 +1,69 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file errcodes.h
+ */
+#ifndef ERRCODES_H
+#define ERRCODES_H
+
+#define PLL_ERROR_FILE_OPEN             1               /**< Error while opening file */
+#define PLL_ERROR_INVALID_FILETYPE      2               /**< Invalid fileType given at pllParseAlignmentFile */
+
+#define  PLL_NNI_P_TIP                  1 << 0          /**< Node p is a tip */
+#define  PLL_NNI_Q_TIP                  1 << 1          /**< Node p->back is a tip */
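+/* The two NNI codes above are distinct bit flags, so both can be carried in one
+   integer and tested independently; an illustrative check (nniError is only a
+   placeholder name):
+
+       if (nniError & PLL_NNI_P_TIP) { ... }
+       if (nniError & PLL_NNI_Q_TIP) { ... }
+*/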
+
+#define  PLL_PARTITION_OUT_OF_BOUNDS    1 << 0      /**< Trying to access a partition index that is out of bounds */
+#define  PLL_BASE_FREQUENCIES_DO_NOT_SUM_TO_1 1 << 1      /**< base frequencies don't sum to 1.0 */
+
+#define PLL_LINKAGE_LIST_OUT_OF_BOUNDS 1 << 0      /**< trying to link a partition index that is out of bounds */
+
+#define PLL_SUBSTITUTION_RATE_OUT_OF_BOUNDS 1 << 0 /**< trying  to set a substitution rate to a value that is out of bounds */
+#define PLL_INVALID_Q_MATRIX_SYMMETRY       1 << 1 /**< specifying an invalid parameter symmetry in the Q matrix */
+#define PLL_Q_MATRIX_SYMMETRY_OUT_OF_BOUNDS 1 << 2 /**< specifying a Q matrix symmetry that is out of bounds */
+
+#define PLL_UNKNOWN_MOLECULAR_DATA_TYPE 1 << 0 /**<PLL is trying to do something for an unknown data type */
+
+#define PLL_INCONSISTENT_SUBST_RATE_OPTIMIZATION_SETTING 1 << 0 /**<PLL detected an inconsistent setting for the Q matrix rate optimization */
+#define PLL_INCONSISTENT_Q_MATRIX_SYMMETRIES_ACROSS_LINKED_PARTITIONS 1 << 1 /**<Q matrix symmetry vector is not identical for linked partitions */
+#define PLL_INCONSISTENT_Q_MATRIX_ENTRIES_ACROSS_LINKED_PARTITIONS 1 << 2 /**<Q matrix entries are not identical for linked partitions */
+#define PLL_INCONSISTENT_ALPHA_STATES_ACROSS_LINKED_PARTITIONS 1 << 3 /**<alpha states are not identical across linked partitions */
+#define PLL_INCONSISTENT_ALPHA_VALUES_ACROSS_LINKED_PARTITIONS 1 << 4 /**<alpha values are not identical across linked partitions */
+#define PLL_INCONSISTENT_FREQUENCY_STATES_ACROSS_LINKED_PARTITIONS 1 << 5 /**<frequency states are not identical across linked partitions */
+#define PLL_INCONSISTENT_FREQUENCY_VALUES_ACROSS_LINKED_PARTITIONS 1 << 6 /**<frequency values are not identical across linked partitions */
+
+#define PLL_NEWICK_ROOTED_TREE          1 << 0          /**< @brief Binary root detected */
+#define PLL_NEWICK_BAD_STRUCTURE        1 << 1          /**< @brief Erroneous tree detected */
+
+
+
+#define PLL_ERROR_PHYLIP_HEADER_SYNTAX         5
+#define PLL_ERROR_PHYLIP_BODY_SYNTAX           6
+#define PLL_ERROR_FASTA_SYNTAX                 7
+
+
+
+
+#endif
diff --git a/pll/evaluateGenericSpecial.c b/pll/evaluateGenericSpecial.c
new file mode 100644
index 0000000..9a0dfc8
--- /dev/null
+++ b/pll/evaluateGenericSpecial.c
@@ -0,0 +1,3321 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file evaluateGenericSpecial.c
+ *   
+ * @brief Functions for computing the log likelihood at a given branch of the tree (i.e. a virtual root that is placed at this branch)
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32 
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+#ifdef __MIC_NATIVE
+#include "mic_native.h"
+#endif
+
+/* the set of functions in here computes the log likelihood at a given branch (the virtual root of a tree) */
+
+/* includes for using SSE3 intrinsics */
+
+#ifdef __SSE3
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+/*#include <tmmintrin.h>*/
+#endif
+
+
+/** @defgroup evaluateLikelihoodGroup Likelihood evaluation
+    
+    This set of functions deals with the evaluation of likelihood for the current topology
+*/
+
+
+
+
+
+
+
+/* below are the function headers for unreadable, highly optimized versions of the above functions 
+   for DNA and protein data that also use SSE3 intrinsics and implement some memory saving tricks.
+   The actual functions can be found at the end of this source file. 
+   All other likelihood function implementation files:
+
+   newviewGenericSpecial.c
+   makenewzGenericSpecial.c
+   evaluatePartialGenericSpecial.c
+
+   are also structured like this. 
+
+   To decide which set of function implementations to use you will have to undefine or define _OPTIMIZED_FUNCTIONS 
+   in the Makefile. 
+   */
+#if (defined(__SSE3) || defined(__AVX))
+
+static double evaluateGTRGAMMAPROT_LG4(int *ex1, int *ex2, int *wptr,
+                                       double *x1, double *x2,  
+                                       double *tipVector[4], 
+                                       unsigned char *tipX1, int n, double *diagptable, const pllBoolean fastScaling,
+                                       double * lg4_weights);
+
+/* GAMMA for proteins with memory saving */
+
+static double evaluateGTRGAMMAPROT_GAPPED_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                                double *x1, double *x2,  
+                                                double *tipVector, 
+                                                unsigned char *tipX1, int n, double *diagptable, 
+                                                double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+
+/* GAMMA for proteins */
+
+static double evaluateGTRGAMMAPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                    double *x1, double *x2,  
+                                    double *tipVector, 
+                                    unsigned char *tipX1, int n, double *diagptable);
+
+/* CAT for proteins */
+
+static double evaluateGTRCATPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                  double *x1, double *x2, double *tipVector,
+                                  unsigned char *tipX1, int n, double *diagptable_start);
+
+
+/* CAT for proteins with memory saving */
+
+static double evaluateGTRCATPROT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                       double *x1, double *x2, double *tipVector,
+                                       unsigned char *tipX1, int n, double *diagptable_start, 
+                                       double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+/* analogous DNA functions */
+
+static double evaluateGTRCAT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                   double *x1_start, double *x2_start, double *tipVector,                     
+                                   unsigned char *tipX1, int n, double *diagptable_start,
+                                   double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+static double evaluateGTRGAMMA_GAPPED_SAVE(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                           double *x1_start, double *x2_start, 
+                                           double *tipVector, 
+                                           unsigned char *tipX1, const int n, double *diagptable,
+                                           double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+static double evaluateGTRGAMMA(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                               double *x1_start, double *x2_start, 
+                               double *tipVector, 
+                               unsigned char *tipX1, const int n, double *diagptable);
+
+
+static double evaluateGTRCAT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                              double *x1_start, double *x2_start, double *tipVector,                  
+                              unsigned char *tipX1, int n, double *diagptable_start);
+
+
+#endif
+
+#if (defined(__AVX) || defined(__SSE3))
+static double evaluateGTRGAMMA_BINARY(int *ex1, int *ex2, int *wptr,
+                                      double *x1_start, double *x2_start, 
+                                      double *tipVector, 
+                                      unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling);
+
+static double evaluateGTRCAT_BINARY (int *ex1, int *ex2, int *cptr, int *wptr,
+                                     double *x1_start, double *x2_start, double *tipVector,                   
+                                     unsigned char *tipX1, int n, double *diagptable_start, const pllBoolean fastScaling);
+#endif
+
+
+/* 
+   global variables of the pthreads version. reductionBuffer is the global array 
+   that is used for implementing deterministic reduction operations, that is,
+   summing the total log likelihood over the partial log likelihoods for the sites that each thread has computed.
+
+   NumberOfThreads is just the number of threads.
+
+   Note the volatile modifier here, which guarantees that the compiler will not do weird optimizations or
+   rearrangements of the code accessing those variables, because it does not know that several concurrent threads 
+   will access those variables simultaneously.
+
+   UPDATE: reductionBuffer is now merged with globalResult
+   */
+
+
+/* a pre-computed 32-bit integer mask */
+
+extern const unsigned int mask32[32];
+
+/* the function below computes the P matrix from the decomposition of the Q matrix and the respective rate categories for a single partition */
+
+/** @brief Compute the diagonal of P matrix for a specific edge
+
+    This function computes the diagonal of P matrix for a branch of length \a z
+    from the decomposition of the Q matrix specified in \a EIGN and the respective
+    rate categories \a rptr for a single partition. The diagonal is then stored in
+    \a diagptable. 
+
+    @param z                  Length of edge
+    @param states             Number of states
+    @param numberOfCategories Number of categories in the rate heterogeneity rate arrays
+    @param rptr               Rate heterogeneity rate arrays
+    @param EIGN               Eigenvalues
+    @param diagptable         Where to store the resulting P matrix
+*/
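+/* In formula form, for rate category i and state l > 0 the loop below stores
+
+       diagptable[i * states + l] = exp(rptr[i] * EIGN[l] * log(z)) = z^(rptr[i] * EIGN[l]),
+
+   while entry l = 0 is always 1.0, corresponding to the zero eigenvalue of the rate matrix. */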
+static void calcDiagptable(const double z, const int states, const int numberOfCategories, const double *rptr, const double *EIGN, double *diagptable)
+{
+  int 
+    i, 
+    l;
+
+  double 
+    lz,
+    *lza = (double *)rax_malloc(sizeof(double) * states);
+
+  /* transform the root branch length to the log and check if it is not too small */
+
+  if (z < PLL_ZMIN) 
+    lz = log(PLL_ZMIN);
+  else
+    lz = log(z);
+
+  /* do some pre-computations to avoid redundant computations further below */
+
+  for(i = 1; i < states; i++)      
+    lza[i] = EIGN[i] * lz; 
+
+  /* loop over the number of per-site or discrete gamma rate categories */
+
+  for(i = 0; i < numberOfCategories; i++)
+  {                    
+    /* 
+       diagptable is a pre-allocated array of doubles that stores the P-Matrix 
+       the first entry is always 1.0 
+       */
+    diagptable[i * states] = 1.0;
+
+    /* compute the P matrix for all remaining states of the model */
+
+    for(l = 1; l < states; l++)
+      diagptable[i * states + l] = exp(rptr[i] * lza[l]);
+  }
+
+  rax_free(lza);
+}
+
+/** @brief Compute the diagonal of P matrix for a specific edge for the LG4 model
+
+    This function computes the diagonal of P matrix for a branch of length \a z
+    from the decomposition of the 4 LG4 Q matrices specified in \a EIGN and the respective
+    rate categories \a rptr for a single partition. The diagonal is then stored in
+    \a diagptable. 
+
+    @param z
+      Length of edge
+
+    @param numberOfCategories
+      Number of categories in the rate heterogeneity rate arrays
+
+    @param rptr
+      Rate heterogeneity rate arrays
+
+    @param EIGN
+      Eigenvalues of the 4 Q matrices
+
+    @param diagptable
+      Where to store the resulting P matrix
+
+    @param numStates
+      Number of states
+*/
+static void calcDiagptableFlex_LG4(double z, int numberOfCategories, double *rptr, double *EIGN[4], double *diagptable, const int numStates)
+{
+  int 
+    i, 
+    l;
+  
+  double 
+    lz;
+  
+  assert(numStates <= 64);
+  
+  if (z < PLL_ZMIN) 
+    lz = log(PLL_ZMIN);
+  else
+    lz = log(z);
+
+  for(i = 0; i <  numberOfCategories; i++)
+    {                  
+      diagptable[i * numStates + 0] = 1.0;
+
+      for(l = 1; l < numStates; l++)
+        diagptable[i * numStates + l] = exp(rptr[i] * EIGN[i][l] * lz);                   
+    }        
+}
+
+static void ascertainmentBiasSequence(unsigned char tip[32], int numStates)
+{ 
+  assert(numStates <= 32 && numStates > 1);
+
+  switch(numStates)
+    {
+    case 2:     
+      tip[0] = 1;
+      tip[1] = 2;
+      break;
+    case 4:
+      tip[0] = 1;
+      tip[1] = 2;
+      tip[2] = 4;
+      tip[3] = 8;
+      break;
+    default:
+      {
+	int 
+	  i;
+	for(i = 0; i < numStates; i++)
+	  {
+	    tip[i] = i;
+	    //printf("%c ", inverseMeaningPROT[i]);
+	  }
+	//printf("\n");
+      }
+      break;
+    }
+}
+
+static double evaluateCatAsc(int *ex1, int *ex2,
+			     double *x1, double *x2,  
+			     double *tipVector, 
+			     unsigned char *tipX1, int n, double *diagptable, const int numStates)
+{
+  double
+    exponent,
+    sum = 0.0, 
+    unobserved,
+    term,
+    *left, 
+    *right;
+  
+  int     
+    i,    
+    l;   
+         
+  unsigned char 
+    tip[32];
+
+  ascertainmentBiasSequence(tip, numStates);
+   
+  if(tipX1)
+    {               
+      for (i = 0; i < n; i++) 
+	{
+	  left = &(tipVector[numStates * tip[i]]);	  	  
+	  right = &(x2[i * numStates]);
+
+	  term = 0.0;
+	         	      
+	  for(l = 0; l < numStates; l++)
+	    term += left[l] * right[l] * diagptable[l];	      	 	 	  	 
+
+	  /* assumes that pow behaves as expected/specified for underflows
+	     from the man page:
+	       If result underflows, and is not representable,
+	       a range error occurs and 0.0 is returned.
+	 */
+
+	  exponent = pow(PLL_MINLIKELIHOOD, (double)ex2[i]);
+
+	  unobserved = fabs(term) * exponent;
+
+#ifdef _DEBUG_ASC
+	  if(ex2[i] > 0)
+	    {
+	      printf("s %d\n", ex2[i]);
+	      assert(0);
+	    }
+#endif	  
+	    
+	  sum += unobserved;
+	}              
+    }              
+  else
+    {           
+      for (i = 0; i < n; i++) 
+	{	  	 
+	  term = 0.0;
+	  	 
+	  left  = &(x1[i * numStates]);
+	  right = &(x2[i * numStates]);	    
+	      
+	  for(l = 0; l < numStates; l++)
+	    term += left[l] * right[l] * diagptable[l];		  
+	  
+	  /* assumes that pow behaves as expected/specified for underflows
+	     from the man page:
+	       If result underflows, and is not representable,
+	       a range error occurs and 0.0 is returned.
+	  */
+
+	  exponent = pow(PLL_MINLIKELIHOOD, (double)(ex1[i] + ex2[i]));
+
+	  unobserved = fabs(term) * exponent;
+	  
+#ifdef _DEBUG_ASC
+	  if(ex2[i] > 0 || ex1[i] > 0)
+	    {
+	      printf("s %d %d\n", ex1[i], ex2[i]);
+	      assert(0);
+	    }
+#endif
+
+	  sum += unobserved;
+	}             
+    }        
+
+  return  sum;
+}
+
+
+static double evaluateGammaAsc(int *ex1, int *ex2,
+				double *x1, double *x2,  
+				double *tipVector, 
+				unsigned char *tipX1, int n, double *diagptable, const int numStates)
+{
+  double
+    exponent,
+    sum = 0.0, 
+    unobserved,
+    term,
+    *left, 
+    *right;
+  
+  int     
+    i, 
+    j, 
+    l;   
+  
+  const int 
+    gammaStates = numStates * 4;
+         
+  unsigned char 
+    tip[32];
+
+  ascertainmentBiasSequence(tip, numStates);
+   
+  if(tipX1)
+    {               
+      for (i = 0; i < n; i++) 
+	{
+	  left = &(tipVector[numStates * tip[i]]);	  	  
+	  
+	  for(j = 0, term = 0.0; j < 4; j++)
+	    {
+	      right = &(x2[gammaStates * i + numStates * j]);
+	      
+	      for(l = 0; l < numStates; l++)
+		term += left[l] * right[l] * diagptable[j * numStates + l];	      
+	    }	 	  	 
+
+      /* assumes that pow behaves as expected/specified for underflows
+         from the man page:
+           If result underflows, and is not representable,
+           a range error occurs and 0.0 is returned.
+      */
+
+      exponent = pow(PLL_MINLIKELIHOOD, (double)ex2[i]);
+
+      unobserved = fabs(term) * exponent;
+	  
+#ifdef _DEBUG_ASC
+	  if(ex2[i] > 0)
+	    {
+	      printf("s %d\n", ex2[i]);
+	      assert(0);
+	    }
+#endif	  
+	    
+	  sum += unobserved;
+	}              
+    }              
+  else
+    {           
+      for (i = 0; i < n; i++) 
+	{	  	 	             
+	  
+	  for(j = 0, term = 0.0; j < 4; j++)
+	    {
+	      left  = &(x1[gammaStates * i + numStates * j]);
+	      right = &(x2[gammaStates * i + numStates * j]);	    
+	      
+	      for(l = 0; l < numStates; l++)
+		term += left[l] * right[l] * diagptable[j * numStates + l];	
+	    }
+	  
+	  /* assumes that pow behaves as expected/specified for underflows
+	     from the man page:
+	       If result underflows, and is not representable,
+	       a range error occurs and 0.0 is returned.
+	  */
+
+	  exponent = pow(PLL_MINLIKELIHOOD, (double)(ex1[i] + ex2[i]));
+
+	  unobserved = fabs(term) * exponent;
+	  
+#ifdef _DEBUG_ASC
+	  if(ex2[i] > 0 || ex1[i] > 0)
+	    {
+	      printf("s %d %d\n", ex1[i], ex2[i]);
+	      assert(0);
+	    }
+#endif
+
+	  sum += unobserved;
+	}             
+    }        
+
+  return  sum;
+}
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief A generic (and slow) implementation of log likelihood evaluation of a tree using the GAMMA model of rate heterogeneity
+    
+    Computes the log likelihood of the topology for a specific partition, assuming
+    that the GAMMA model of rate heterogeneity is used. The likelihood is computed at
+    a virtual root placed at an edge whose two end-points (nodes) have the conditional
+    likelihood vectors \a x1 and \a x2. 
+    Furthermore, if \a getPerSiteLikelihoods is set to \b PLL_TRUE, then the log
+    likelihood for each site is also computed and stored at the corresponding position
+    in the array \a perSiteLikelihoods.
+
+    @param fastScaling
+      If set to \b PLL_FALSE, then the likelihood of each site is also multiplied by \a log(PLL_MINLIKELIHOOD) times the number
+      of times it has been scaled down
+
+    @param ex1
+      An array that holds how many times a site has been scaled and points at the entries for node \a p. This
+      parameter is used if \a fastScaling is set to \b PLL_FALSE.
+
+    @param ex2
+      An array that holds how many times a site has been scaled and points at the entries for node \a q. This
+      parameter is used if \a fastScaling is set to \b PLL_FALSE.
+
+    @param wptr
+      Array holding the weight for each site in the compressed partition alignment
+
+    @param x1_start
+      Conditional likelihood vectors for one of the two end-points of the specific edge for which we are evaluating the likelihood
+
+    @param x2_start
+      Conditional likelihood vectors for the other end-point of the specific edge for which we are evaluating the likelihood
+
+    @param tipVector
+      Precomputed table where the number of rows is equal to the number of possible basepair characters for the current data 
+      type, i.e. 16 for DNA and 23 for AA, and each row contains \a states elements, each of which contains transition
+      probabilities computed from the eigenvectors of the decomposed Q matrix.
+
+    @param tipX1
+      If one of the two end-points (nodes) of the specific edge (for which we are evaluating the likelihood) is a tip, then
+      this holds a pointer to the sequence data (basepairs) already converted in the internal integer representation, and \a x2
+      holds the conditional likelihood vectors for the internal node.
+
+    @param n
+      Number of sites for which we are doing the evaluation. For the single-thread version this is the 
+      number of sites in the current partition, for multi-threads this is the number of sites assigned
+      to the running thread from the current partition.
+
+    @param diagptable
+      Start of the array that contains the P-Matrix diagonal of the specific edge for which we are
+      evaluating the likelihood, and for each category of the GAMMA model
+
+    @param states
+      Number of states (4 for DNA, 20 for AA)
+
+    @param perSiteLikelihoods
+      Array to store per-site log likelihoods if \a getPerSiteLikelihoods is set to \b PLL_TRUE
+
+    @param getPerSiteLikelihoods
+      If set to \b PLL_TRUE then per-site log likelihoods are also computed and stored in \a perSiteLikelihoods
+
+    @return
+      The evaluated log likelihood of the tree topology
+*/
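+/* Restated as a formula, for an inner-inner branch the loop below computes, per site i,
+
+       term_i = log( 0.25 * | sum_{j=0..3} sum_{k=0..states-1}
+                       x1[j * states + k] * x2[j * states + k] * diagptable[j * states + k] | )
+
+   (in the tip case x1 is the per-state tip vector, identical for all j), optionally adds the
+   scaling correction (ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD), and returns sum_i wptr[i] * term_i. */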
+static double evaluateGAMMA_FLEX(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                 double *x1_start, double *x2_start, 
+                                 double *tipVector, 
+                                 unsigned char *tipX1, const int n, double *diagptable, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods)
+{
+  double   
+    sum = 0.0, 
+    term,
+    *x1,
+    *x2;
+
+  int     
+    i, 
+    j,
+    k;
+
+  /* span is the offset within the likelihood array at an inner node that gets us from the values 
+     of site i to the values of site i + 1 */
+
+  const int 
+    span = states * 4;
+
+
+  /* we distinguish between two cases here: one of the two nodes defining the branch at which we put the virtual root is 
+     a tip. Both nodes cannot be tips because we do not allow for two-taxon trees ;-) 
+     Note that, if a node is a tip, this will always be tipX1. This is done for code simplicity, and the flipping of the nodes
+     is done beforehand when we compute the traversal descriptor.     
+     */
+
+  /* the left node is a tip */
+  if(tipX1)
+  {             
+    /* loop over the sites of this partition */
+    for (i = 0; i < n; i++)
+    {
+      /* access pre-computed tip vector values via a lookup table */
+      x1 = &(tipVector[states * tipX1[i]]);      
+      /* access the other(inner) node at the other end of the branch */
+      x2 = &(x2_start[span * i]);        
+
+      /* loop over GAMMA rate categories, hard-coded as 4 in RAxML */
+      for(j = 0, term = 0.0; j < 4; j++)
+        /* loop over states and multiply them with the P matrix */
+        for(k = 0; k < states; k++)
+          term += x1[k] * x2[j * states + k] * diagptable[j * states + k];                                                        
+
+      /* take the log of the likelihood and multiply the per-gamma rate likelihood by 1/4.
+         Under the GAMMA model the 4 discrete GAMMA rates all have the same probability 
+         of 0.25 */
+
+      if(!fastScaling)
+        term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(0.25 * fabs(term));
+
+      /* if required get the per-site log likelihoods.
+         note that these are the plain per site log-likes, not 
+         multiplied with the pattern weight value */
+      
+      if(getPerSiteLikelihoods)
+        perSiteLikelihoods[i] = term;
+
+      sum += wptr[i] * term;
+    }     
+  }
+  else
+  {        
+    for (i = 0; i < n; i++) 
+    {
+      /* same as before, only that now we access two inner likelihood vectors x1 and x2 */
+
+      x1 = &(x1_start[span * i]);
+      x2 = &(x2_start[span * i]);                 
+
+      for(j = 0, term = 0.0; j < 4; j++)
+        for(k = 0; k < states; k++)
+          term += x1[j * states + k] * x2[j * states + k] * diagptable[j * states + k];
+
+      if(!fastScaling)
+        term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i])*log(PLL_MINLIKELIHOOD));
+      else
+        term = log(0.25 * fabs(term));
+      
+      if(getPerSiteLikelihoods)
+        perSiteLikelihoods[i] = term;
+
+      sum += wptr[i] * term;
+    }                           
+  }
+
+  return sum;
+} 
+
+#if (defined(__SSE3) || defined(__AVX))
+/** @ingroup evaluateLikelihoodGroup
+    @brief Memory saving version of the generic (and slow) implementation of log likelihood evaluation of a tree using the GAMMA model of rate heterogeneity
+
+    Computes the log likelihood of the topology for a specific partition, assuming
+    that the GAMMA model of rate heterogeneity is used and memory saving technique
+    is enabled. The likelihood is computed at a virtual root placed at an edge whose
+    two end-points (nodes) have the conditional likelihood vectors \a x1 and \a x2. 
+    Furthermore, if \a getPerSiteLikelihoods is set to \b PLL_TRUE, then the log
+    likelihood for each site is also computed and stored at the corresponding position
+    in the array \a perSiteLikelihoods.
+
+    @param fastScaling
+      If set to \b PLL_FALSE, then the likelihood of each site is also multiplied by \a log(PLL_MINLIKELIHOOD) times the number
+      of times it has been scaled down
+
+    @param ex1
+      An array that holds how many times a site has been scaled and points at the entries for node \a p. This
+      parameter is used if \a fastScaling is set to \b PLL_FALSE.
+
+    @param ex2
+      An array that holds how many times a site has been scaled and points at the entries for node \a q. This
+      parameter is used if \a fastScaling is set to \b PLL_FALSE.
+
+    @param wptr
+      Array holding the weight for each site in the compressed partition alignment
+
+    @param x1_start
+      Conditional likelihood vectors for one of the two end-points of the specific edge for which we are evaluating the likelihood
+
+    @param x2_start
+      Conditional likelihood vectors for the other end-point of the specific edge for which we are evaluating the likelihood
+
+    @param tipVector
+      Precomputed table where the number of rows is equal to the number of possible basepair characters for the current data 
+      type, i.e. 16 for DNA and 23 for AA, and each row contains \a states elements, each of which contains transition
+      probabilities computed from the eigenvectors of the decomposed Q matrix.
+
+    @param tipX1
+      If one of the two end-points (nodes) of the specific edge (for which we are evaluating the likelihood) is a tip, then
+      this holds a pointer to the sequence data (basepairs) already converted in the internal integer representation, and \a x2
+      holds the conditional likelihood vectors for the internal node.
+
+    @param n
+      Number of sites for which we are doing the evaluation. For the single-thread version this is the 
+      number of sites in the current partition, for multi-threads this is the number of sites assigned
+      to the running thread from the current partition.
+
+    @param diagptable
+      Start of the array that contains the P-Matrix diagonal of the specific edge for which we are
+      evaluating the likelihood, and for each category of the GAMMA model
+
+    @param states
+      Number of states (4 for DNA, 20 for AA)
+
+    @param perSiteLikelihoods
+      Array to store per-site log likelihoods if \a getPerSiteLikelihoods is set to \b PLL_TRUE
+
+    @param getPerSiteLikelihoods
+      If set to \b PLL_TRUE then per-site log likelihoods are also computed and stored in \a perSiteLikelihoods
+
+    @param x1_gapColumn
+
+    @param x2_gapColumn
+
+    @param x1_gap
+      Gap bitvector for the left child node
+
+    @param x2_gap
+      Gap bitvector for the right child node
+
+    @return
+      The evaluated log likelihood of the tree topology
+
+    @todo
+      Document x1_gapColumn, x2_gapColumn, x1_gap, x2_gap and add a brief description of how this technique works
+*/
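+/* Brief sketch of the memory saving technique used below: x1_gap and x2_gap are
+   per-site bitvectors; if bit i is set (tested as x2_gap[i / 32] & mask32[i % 32]),
+   no individual conditional likelihood vector is stored for site i at that node and
+   the shared column x2_gapColumn (respectively x1_gapColumn) is used instead, so the
+   per-site pointer is not advanced.  The bit presumably marks sites whose subtree
+   contains only gap characters, which is why a single shared vector suffices. */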
+static double evaluateGAMMA_FLEX_SAVE(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                      double *x1_start, double *x2_start, 
+                                      double *tipVector, 
+                                      unsigned char *tipX1, const int n, double *diagptable, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods,
+                                      double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  double   
+    sum = 0.0, 
+    term,
+    *x1,
+    *x2,
+    *x1_ptr = x1_start,
+    *x2_ptr = x2_start;
+    
+  int     
+    i, 
+    j,
+    k;
+
+  /* span is the offset within the likelihood array at an inner node that gets us from the values 
+     of site i to the values of site i + 1 */
+
+  const int 
+    span = states * 4;
+
+
+  /* we distinguish between two cases here: one of the two nodes defining the branch at which we put the virtual root is 
+     a tip. Both nodes cannot be tips because we do not allow for two-taxon trees ;-) 
+     Note that, if a node is a tip, this will always be tipX1. This is done for code simplicity, and the flipping of the nodes
+     is done beforehand when we compute the traversal descriptor.     
+     */
+
+  /* the left node is a tip */
+  if(tipX1)
+  {             
+    /* loop over the sites of this partition */
+    for (i = 0; i < n; i++)
+    {
+      /* access pre-computed tip vector values via a lookup table */
+      x1 = &(tipVector[states * tipX1[i]]);      
+      /* access the other(inner) node at the other end of the branch */
+
+      if(x2_gap[i / 32] & mask32[i % 32])
+        x2 = x2_gapColumn;
+      else
+        {
+          x2 = x2_ptr;
+          x2_ptr += span;
+        }
+
+      /* loop over GAMMA rate categories, hard-coded as 4 in RAxML */
+      for(j = 0, term = 0.0; j < 4; j++)
+        /* loop over states and multiply them with the P matrix */
+        for(k = 0; k < states; k++)
+          term += x1[k] * x2[j * states + k] * diagptable[j * states + k];                                                        
+
+      /* take the log of the likelihood and multiply the per-gamma rate likelihood by 1/4.
+         Under the GAMMA model the 4 discrete GAMMA rates all have the same probability 
+         of 0.25 */
+
+      if(!fastScaling)
+        term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(0.25 * fabs(term));
+
+      /* if required get the per-site log likelihoods.
+         note that these are the plain per site log-likes, not 
+         multiplied with the pattern weight value */
+      
+      if(getPerSiteLikelihoods)
+        perSiteLikelihoods[i] = term;
+
+      sum += wptr[i] * term;
+    }     
+  }
+  else
+  {        
+    for (i = 0; i < n; i++) 
+    {
+      /* same as before, only that now we access two inner likelihood vectors x1 and x2 */
+      
+      if(x1_gap[i / 32] & mask32[i % 32])
+        x1 = x1_gapColumn;
+      else
+        {
+          x1 = x1_ptr;
+          x1_ptr += span;
+        }    
+
+      if(x2_gap[i / 32] & mask32[i % 32])
+        x2 = x2_gapColumn;
+      else
+        {
+          x2 = x2_ptr;
+          x2_ptr += span;
+        }                 
+
+      for(j = 0, term = 0.0; j < 4; j++)
+        for(k = 0; k < states; k++)
+          term += x1[j * states + k] * x2[j * states + k] * diagptable[j * states + k];
+
+      if(!fastScaling)
+        term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i])*log(PLL_MINLIKELIHOOD));
+      else
+        term = log(0.25 * fabs(term));
+      
+      if(getPerSiteLikelihoods)
+        perSiteLikelihoods[i] = term;
+
+      sum += wptr[i] * term;
+    }                           
+  }
+
+  return sum;
+} 
+#endif
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief A generic (and slow) implementation of log likelihood evaluation of a tree using the CAT model of rate heterogeneity
+    
+    Computes the log likelihood of the topology for a specific partition, assuming
+    that the CAT model of rate heterogeneity is used. The likelihood is computed at
+    a virtual root placed at an edge whose two end-points (nodes) have the conditional
+    likelihood vectors \a x1 and \a x2. 
+    Furthermore, if \a getPerSiteLikelihoods is set to \b PLL_TRUE, then the log
+    likelihood for each site is also computed and stored at the corresponding position
+    in the array \a perSiteLikelihoods.
+
+    @param fastScaling
+      If set to \b PLL_FALSE, then the likelihood of each site is also multiplied by \a log(PLL_MINLIKELIHOOD) times the number
+      of times it has been scaled down
+
+    @param ex1
+      An array that holds how many times a site has been scaled and points at the entries for node \a p. This
+      parameter is used if \a fastScaling is set to \b PLL_FALSE.
+
+    @param ex2
+      An array that holds how many times a site has been scaled and points at the entries for node \a q. This
+      parameter is used if \a fastScaling is set to \b PLL_FALSE.
+
+    @param cptr
+      Array holding the rate for each site in the compressed partition alignment
+
+    @param wptr
+      Array holding the weight for each site in the compressed partition alignment
+
+    @param x1
+      Conditional likelihood vectors for one of the two end-points of the specific edge for which we are evaluating the likelihood
+
+    @param x2
+      Conditional likelihood vectors for the other end-point of the specific edge for which we are evaluating the likelihood
+
+    @param tipVector
+      Precomputed table where the number of rows is equal to the number of possible basepair characters for the current data type, 
+      i.e. 16 for DNA and 23 for AA, and each row contains \a states elements, each of which contains transition probabilities 
+      computed from the eigenvectors of the decomposed Q matrix.
+
+    @param tipX1
+      If one of the two end-points (nodes) of the specific edge (for which we are evaluating the likelihood) is a tip, then
+      this holds a pointer to the sequence data (basepairs) already converted in the internal integer representation, and \a x2
+      holds the conditional likelihood vectors for the internal node.
+
+    @param n
+      Number of sites for which we are doing the evaluation. For the single-thread version this is the number of sites in the
+      current partition, for multi-threads this is the number of sites assigned to the running thread from the current partition.
+
+    @param diagptable_start
+      Start of the array that contains the P-Matrix diagonal of the specific edge for which we are evaluating the likelihood,
+      and for each category of the CAT model
+
+    @param states
+      Number of states (4 for DNA, 20 for AA)
+
+    @param perSiteLikelihoods
+      Array to store per-site log likelihoods if \a getPerSiteLikelihoods is set to \b PLL_TRUE
+
+    @param getPerSiteLikelihoods
+      If set to \b PLL_TRUE then per-site log likelihoods are also computed and stored in \a perSiteLikelihoods
+
+    @return
+      The evaluated log likelihood of the tree topology
+*/
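+/* Under CAT the per-site computation below is the same as for GAMMA but without the
+   sum over four discrete rates (and hence without the 0.25 factor): with P_{c(i)}
+   denoting the P-matrix diagonal of site i's rate category cptr[i],
+
+       term_i = log( | sum_{l=0..states-1} left[l] * right[l] * P_{c(i)}[l] | ),
+
+   optionally corrected by (ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD), and the
+   function returns sum_i wptr[i] * term_i. */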
+static double evaluateCAT_FLEX (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                double *x1, double *x2, double *tipVector,
+                                unsigned char *tipX1, int n, double *diagptable_start, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods)
+{
+  double   
+    sum = 0.0, 
+    term,
+    *diagptable,  
+    *left, 
+    *right;
+
+  int     
+    i, 
+    l;                           
+
+  /* choosing between tip vectors and non-tip vectors is identical in all flavors of this function, regardless 
+     of whether we are using CAT, GAMMA, DNA or protein data etc. */
+
+  if(tipX1)
+  {                 
+    for (i = 0; i < n; i++) 
+    {
+      /* same as in the GAMMA implementation */
+      left = &(tipVector[states * tipX1[i]]);
+      right = &(x2[states * i]);
+
+      /* important difference here: unlike GAMMA, we do not have 
+         4 P matrices assigned to each site, but just one. However, those 
+         P matrices can differ from site to site.
+         Hence we index into the precalculated P-matrices for individual sites 
+         via the category pointer cptr[i]
+         */
+      diagptable = &diagptable_start[states * cptr[i]];                  
+
+      /* similar to gamma, with the only difference that we do not integrate (sum)
+         over the discrete gamma rates, but simply compute the likelihood of the 
+         site and the given P-matrix */
+
+      for(l = 0, term = 0.0; l < states; l++)
+        term += left[l] * right[l] * diagptable[l];                        
+
+      /* take the log */
+       if(!fastScaling)
+         term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+       else
+         term = log(fabs(term));
+
+       /* if required get the per-site log likelihoods.
+          note that these are the plain per site log-likes, not 
+          multiplied with the pattern weight value */
+
+       if(getPerSiteLikelihoods)
+         perSiteLikelihoods[i] = term;
+
+      /* 
+         multiply the log with the pattern weight of this site. 
+         The site pattern for which we just computed the likelihood may 
+         represent several alignment columns that have been compressed 
+         into one site pattern if they are exactly identical AND evolve under the same model,
+         i.e., form part of the same partition.
+         */                  
+
+      sum += wptr[i] * term;
+    }      
+  }    
+  else
+  {    
+    for (i = 0; i < n; i++) 
+    {   
+      /* as before we now access the likelihood arrays of two inner nodes */
+      left  = &x1[states * i];
+      right = &x2[states * i];
+
+      diagptable = &diagptable_start[states * cptr[i]];         
+
+      for(l = 0, term = 0.0; l < states; l++)
+        term += left[l] * right[l] * diagptable[l];
+      
+      if(!fastScaling)
+        term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(term));  
+
+      if(getPerSiteLikelihoods)
+        perSiteLikelihoods[i] = term;
+
+      sum += wptr[i] * term;      
+    }
+  }
+
+  return  sum;         
+} 
+
+#if (defined(__SSE3) || defined(__AVX))
+/** @ingroup evaluateLikelihoodGroup
+    @brief A generic (and slow) implementation of log likelihood evaluation of a tree using the CAT model of rate heterogeneity with memory saving
+    
+    This is the same as ::evaluateCAT_FLEX but with the memory saving technique enabled.
+    Please check ::evaluateCAT_FLEX for more information and a description of the common
+    input parameters
+    
+    @param x1_gapColumn
+
+    @param x2_gapColumn
+
+    @param x1_gap
+      Gap bitvector for the left child node
+
+    @param x2_gap
+      Gap bitvector for the right child node
+    
+    @todo
+      Comment on x1_gapColumn and x2_gapColumn
+*/
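+/* Same gap-bitvector / shared gap-column scheme as described above evaluateGAMMA_FLEX_SAVE,
+   here tested via isGap() and with per-site vectors of length states rather than 4 * states. */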
+static double evaluateCAT_FLEX_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                     double *x1, double *x2, double *tipVector,
+                                     unsigned char *tipX1, int n, double *diagptable_start, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods,
+                                     double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  double   
+    sum = 0.0, 
+    term,
+    *diagptable,  
+    *left, 
+    *right,
+    *left_ptr = x1,
+    *right_ptr = x2;
+
+  int     
+    i, 
+    l;                           
+
+  /* choosing between tip vectors and non-tip vectors is identical in all flavors of this function, regardless 
+     of whether we are using CAT, GAMMA, DNA or protein data etc. */
+
+  if(tipX1)
+  {                 
+    for (i = 0; i < n; i++) 
+    {
+      /* same as in the GAMMA implementation */
+      left = &(tipVector[states * tipX1[i]]);
+   
+      if(isGap(x2_gap, i))
+        right = x2_gapColumn;
+      else
+        {
+          right = right_ptr;
+          right_ptr += states;
+        }         
+      /* important difference here: unlike GAMMA, we do not have 
+         4 P matrices assigned to each site, but just one. However, those 
+         P matrices can differ from site to site.
+         Hence we index into the precalculated P-matrices for individual sites 
+         via the category pointer cptr[i]
+         */
+      diagptable = &diagptable_start[states * cptr[i]];                  
+
+      /* similar to gamma, with the only difference that we do not integrate (sum)
+         over the discrete gamma rates, but simply compute the likelihood of the 
+         site and the given P-matrix */
+
+      for(l = 0, term = 0.0; l < states; l++)
+        term += left[l] * right[l] * diagptable[l];                        
+
+      /* take the log */
+       if(!fastScaling)
+         term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+       else
+         term = log(fabs(term));
+
+       /* if required get the per-site log likelihoods.
+          note that these are the plain per site log-likes, not 
+          multiplied with the pattern weight value */
+
+       if(getPerSiteLikelihoods)
+         perSiteLikelihoods[i] = term;
+
+      /* 
+         multiply the log with the pattern weight of this site. 
+         The site pattern for which we just computed the likelihood may 
+         represent several alignment columns that have been compressed 
+         into one site pattern if they are exactly identical AND evolve under the same model,
+         i.e., form part of the same partition.
+         */                  
+
+      sum += wptr[i] * term;
+    }      
+  }    
+  else
+  {    
+    for (i = 0; i < n; i++) 
+    {   
+      /* as before we now access the likelihood arrays of two inner nodes */
+
+      if(isGap(x1_gap, i))
+        left = x1_gapColumn;
+      else
+        {
+          left = left_ptr;
+          left_ptr += states;
+        }       
+
+      if(isGap(x2_gap, i))
+        right = x2_gapColumn;
+      else
+        {
+          right = right_ptr;
+          right_ptr += states;
+        }       
+
+      diagptable = &diagptable_start[states * cptr[i]];         
+
+      for(l = 0, term = 0.0; l < states; l++)
+        term += left[l] * right[l] * diagptable[l];
+      
+      if(!fastScaling)
+        term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(term));  
+
+      if(getPerSiteLikelihoods)
+        perSiteLikelihoods[i] = term;
+
+      sum += wptr[i] * term;      
+    }
+  }
+
+  return  sum;         
+} 
+#endif
+
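+/* Editor's note: a minimal, self-contained sketch (not part of the PLL sources) of the
+   per-site CAT term accumulated by the function above: one P-matrix diagonal per site,
+   selected via the site's rate category; scaling and gap handling are omitted. */
+#if 0
+static double sketchCatSiteLogLike(const double *left, const double *right,
+                                   const double *diagptable_start, int category,
+                                   int states, int weight)
+{
+  const double *diag = &diagptable_start[states * category];
+  double term = 0.0;
+  int l;
+
+  /* dot product of the two conditional vectors with the P-matrix diagonal */
+  for (l = 0; l < states; l++)
+    term += left[l] * right[l] * diag[l];
+
+  /* per-site log likelihood, weighted by the compressed pattern weight */
+  return weight * log(fabs(term));
+}
+#endif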
+
+/* This is the core function for computing the log likelihood at a branch */
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluate the log likelihood of a specific branch of the topology
+    
+    Evaluates the likelihood of the tree topology assuming a virtual root is
+    placed at the edge whose end-points are the nodes with numbers \a pNumber and \a
+    qNumber in the first slot of the traversal descriptor. The function first
+    computes the conditional likelihoods for all necessary nodes (the ones in
+    the traversal descriptor list) by calling the function \a pllNewviewIterative
+    and then evaluates the likelihood at the root. In addition, if \a
+    getPerSiteLikelihoods is set to \b PLL_TRUE, the per-site likelihoods are
+    stored in \a tr->lhs.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param getPerSiteLikelihoods
+      If set to \b PLL_TRUE, compute the log likelihood for each site. 
+
+    @note
+      This is an internal function and should not be called by the user. It assumes
+      that a valid traversal descriptor has already been computed. It also assumes
+      that the edge we are referring to is an edge that leads to a tip, i.e. either
+      p or q of the first entry of traversal descriptor are tips.
+*/
+void pllEvaluateIterative(pllInstance *tr, partitionList *pr, pllBoolean getPerSiteLikelihoods)
+{
+  /* the branch lengths and node indices of the virtual root branch are always the first ones 
+     stored in the very important traversal array data structure that describes a partial or full tree traversal */
+
+  /* get the branch length at the root */
+  double 
+    *pz = tr->td[0].ti[0].qz;   
+
+  /* get the node number of the node to the left and right of the branch that defines the virtual rooting */
+
+  int    
+    pNumber = tr->td[0].ti[0].pNumber, 
+    qNumber = tr->td[0].ti[0].qNumber, 
+    p_slot,
+    q_slot,
+    model;
+  
+  pllBoolean
+    fastScaling = tr->fastScaling;
+
+  /* the slots are the entries in xVector where the LH vector is available */
+  if(tr->useRecom)
+    {
+      p_slot = tr->td[0].ti[0].slot_p;
+      q_slot = tr->td[0].ti[0].slot_q;
+    }
+  else
+    {
+      p_slot = pNumber - tr->mxtips - 1;
+      q_slot = qNumber - tr->mxtips - 1;
+    }
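+
+  /* Editor's note: for example, with tr->mxtips == 10 taxa the inner nodes are numbered
+     from 11 upwards, so inner node 11 maps to xVector slot 0, node 12 to slot 1, and so on. */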
+  
+  /* before we can compute the likelihood at the virtual root, we need to do a partial or full tree traversal to compute 
+     the conditional likelihoods of the vectors as specified in the traversal descriptor. Keeping this traversal descriptor consistent 
+     will unfortunately be the responsibility of the user. This is tricky if, as planned here, we use a rooted view (described in Felsenstein's book)
+     of the conditional vectors with respect to the tree.
+     */
+
+  /* iterate over all valid entries in the traversal descriptor */
+
+  pllNewviewIterative(tr, pr, 1);
+
+  /* after the above call we are sure that we have properly and consistently computed the 
+     conditionals to the right and left of the virtual root and we can now invoke the 
+     the log likelihood computation */
+
+  /* we need to loop over all partitions. Note that we may have a mix of DNA, protein, binary data etc. partitions */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {    
+      /* what's the number of sites of this partition (at the current thread)? */
+      int           
+        width = pr->partitionData[model]->width;
+      
+      /* 
+         Important part of the traversal descriptor: 
+         figure out if we need to recalculate the likelihood of this 
+         partition: 
+         
+         The reasons why this is important in terms of performance are given in this paper 
+         here which you should actually read:
+         
+         A. Stamatakis, M. Ott: "Load Balance in the Phylogenetic Likelihood Kernel". Proceedings of ICPP 2009, accepted for publication, Vienna, Austria, September 2009
+         
+         The width > 0 check tests whether, under the cyclic data distribution of per-partition sites to threads, this thread does indeed have a site 
+         of the current partition.
+         
+      */
+
+      if(tr->td[0].executeModel[model] && width > 0)
+        {       
+          int 
+#if (defined(__SSE3) || defined(__AVX))
+            rateHet = (int)discreteRateCategories(tr->rateHetModel),
+#endif
+            categories,
+            ascWidth = pr->partitionData[model]->states,
+            
+            /* get the number of states in the partition, e.g.: 4 = DNA, 20 = Protein */
+            
+            states = pr->partitionData[model]->states,
+            *ex1 = NULL,
+            *ex2 = NULL,
+            *ex1_asc = NULL,
+            *ex2_asc = NULL;
+          
+          double 
+            *rateCategories = (double*)NULL,
+            z, 
+            partitionLikelihood = 0.0,
+            *x1_start           = NULL,
+            *x2_start           = NULL,
+            *diagptable         = NULL,
+            *x1_start_asc       = NULL,
+            *x2_start_asc       = NULL;
+
+#if (defined(__SSE3) || defined(__AVX))
+          double
+            *x1_gapColumn = (double*)NULL,
+            *x2_gapColumn = (double*)NULL;
+#endif
+          
+#if (defined(__SSE3) || defined(__AVX))
+          unsigned int
+            *x1_gap = (unsigned int*)NULL,
+            *x2_gap = (unsigned int*)NULL;       
+#endif
+          
+          unsigned char 
+            *tip = (unsigned char*)NULL;          
+          
+          /* 
+             figure out if we are using the CAT or GAMMA model of rate heterogeneity 
+             and set pointers to the rate heterogeneity rate arrays and also set the 
+             number of distinct rate categories appropriately.
+             
+             Under GAMMA this is constant and hard-coded as 4, whereas under CAT 
+             the number of site-wise rate categories can vary in the course of the computations 
+             up to a user-defined maximum number of site categories (default: 25)
+          */
+
+          if(tr->rateHetModel == PLL_CAT)
+            {        
+              rateCategories = pr->partitionData[model]->perSiteRates;
+              categories = pr->partitionData[model]->numberOfCategories;
+            }
+          else  /* GAMMA */
+            {        
+              rateCategories = pr->partitionData[model]->gammaRates;
+              categories = 4;
+            }
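+          
+          /* Editor's note: for example, a DNA partition under GAMMA uses states = 4 and a fixed
+             categories = 4 with the rates taken from gammaRates, while the same partition under
+             PSR/CAT uses categories = numberOfCategories (bounded by the user-defined maximum),
+             with the actual per-category rates stored in perSiteRates. */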
+          
+          /* set this pointer to the memory area where space has been reserved a priori for storing the 
+             P matrix at the root */
+          
+          diagptable = pr->partitionData[model]->left;
+          
+          /* figure out if we need to address tip vectors (a char array that indexes into a precomputed tip likelihood 
+             value array) or if we need to address inner vectors */
+          
+          /* either node p or node q is a tip */
+          
+          if(isTip(pNumber, tr->mxtips) || isTip(qNumber, tr->mxtips))
+            {                       
+              /* q is a tip */
+              
+              if(isTip(qNumber, tr->mxtips))
+                {       
+                  /* get the start address of the inner likelihood vector x2 for partition model,
+                     note that inner nodes are enumerated/indexed starting at 0 to save allocating some 
+                     space for additional pointers */
+
+                  x2_start = pr->partitionData[model]->xVector[p_slot];
+                  
+                  /* get the corresponding tip vector */
+                  
+                  tip      = pr->partitionData[model]->yVector[qNumber];
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+                  if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
+#else
+                  if (pr->partitionData[model]->ascBias)
+#endif
+                   {
+                     x2_start_asc  = &pr->partitionData[model]->ascVector[(pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                     ex2_asc       = &pr->partitionData[model]->ascExpVector[(pNumber - tr->mxtips - 1) * ascWidth];
+                   }
+
+                  
+                  /* memory saving stuff, let's deal with this later or ask Fernando ;-) */
+                  
+#if (defined(__SSE3) || defined(__AVX))
+                  if(tr->saveMemory)
+                    {
+                      x2_gap         = &(pr->partitionData[model]->gapVector[pNumber * pr->partitionData[model]->gapVectorLength]);
+                      x2_gapColumn   = &(pr->partitionData[model]->gapColumn[(pNumber - tr->mxtips - 1) * states * rateHet]);
+                    }
+#endif
+                  /* per site likelihood scaling */
+
+                  if(!fastScaling)                  
+                    ex2 = pr->partitionData[model]->expVector[p_slot];              
+                }           
+              else
+                {       
+                  /* p is a tip, same as above */
+                  
+                  x2_start = pr->partitionData[model]->xVector[q_slot];
+                  tip = pr->partitionData[model]->yVector[pNumber];
+
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+                  if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
+#else
+                  if (pr->partitionData[model]->ascBias)
+#endif
+                   {
+                     x2_start_asc  = &pr->partitionData[model]->ascVector[(qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                     ex2_asc       = &pr->partitionData[model]->ascExpVector[(qNumber - tr->mxtips - 1) * ascWidth];
+                   }
+                  
+#if (defined(__SSE3) || defined(__AVX))
+                  if(tr->saveMemory)
+                    {
+                      x2_gap         = &(pr->partitionData[model]->gapVector[qNumber * pr->partitionData[model]->gapVectorLength]);
+                      x2_gapColumn   = &(pr->partitionData[model]->gapColumn[(qNumber - tr->mxtips - 1) * states * rateHet]);
+                    }
+#endif
+
+                  /* per site likelihood scaling */
+
+                  if(!fastScaling)                  
+                    ex2 = pr->partitionData[model]->expVector[q_slot];             
+                }
+            }
+          else
+            {  
+              
+              assert(p_slot != q_slot);
+              /* neither p nor q are tips, hence we need to get the addresses of two inner vectors */
+              
+              x1_start = pr->partitionData[model]->xVector[p_slot];
+              x2_start = pr->partitionData[model]->xVector[q_slot];
+              
+              /* memory saving option */
+              
+#if (defined(__SSE3) || defined(__AVX))
+              if(tr->saveMemory)
+                {
+                  x1_gap = &(pr->partitionData[model]->gapVector[pNumber * pr->partitionData[model]->gapVectorLength]);
+                  x2_gap = &(pr->partitionData[model]->gapVector[qNumber * pr->partitionData[model]->gapVectorLength]);
+                  x1_gapColumn   = &pr->partitionData[model]->gapColumn[(pNumber - tr->mxtips - 1) * states * rateHet];
+                  x2_gapColumn   = &pr->partitionData[model]->gapColumn[(qNumber - tr->mxtips - 1) * states * rateHet];
+                }
+#endif
+                      
+              /* per site likelihood scaling */
+
+              if(!fastScaling)
+                {
+                  ex1      = pr->partitionData[model]->expVector[p_slot];
+                  ex2      = pr->partitionData[model]->expVector[q_slot];     
+                }
+              
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+              if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
+#else
+              if (pr->partitionData[model]->ascBias)
+#endif
+               {
+                 x1_start_asc  = &pr->partitionData[model]->ascVector[(pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                 x2_start_asc  = &pr->partitionData[model]->ascVector[(qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+
+                 ex1_asc       = &pr->partitionData[model]->ascExpVector[(pNumber - tr->mxtips - 1) * ascWidth];
+                 ex2_asc       = &pr->partitionData[model]->ascExpVector[(qNumber - tr->mxtips - 1) * ascWidth];
+               }
+
+
+
+            }
+          
+          
+          /* if we are using a per-partition branch length estimate, the branch has an index, otherwise, for a joint branch length
+             estimate over all partitions we just use the branch length value with index 0 */
+          
+          if(pr->perGeneBranchLengths)
+            z = pz[model];
+          else
+            z = pz[0];
+          
+          /* calc P-Matrix at root for branch z connecting nodes p and q */
+          
+          if(pr->partitionData[model]->dataType == PLL_AA_DATA && (pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X))
+            calcDiagptableFlex_LG4(z, 4, pr->partitionData[model]->gammaRates, pr->partitionData[model]->EIGN_LG4, diagptable, 20);
+          else
+            calcDiagptable(z, states, categories, rateCategories, pr->partitionData[model]->EIGN, diagptable);
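+
+          /* Editor's note (a rough characterization; see calcDiagptable()/calcDiagptableFlex_LG4()
+             for the exact formula): diagptable is laid out as one block of `states` entries per rate
+             category or GAMMA rate class, and each entry is essentially the exponentiated eigenvalue
+             term of the diagonalized P matrix for branch length z and that rate, so the evaluate
+             functions below only need dot products over states (and, for GAMMA, over rate classes). */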
+          
+#if (!defined(__SSE3) && !defined(__AVX) && !defined(__MIC_NATIVE))
+          
+          /* generic slow functions, memory saving option is not implemented for these */
+          
+          assert(!tr->saveMemory);
+          
+          /* decide whether CAT or GAMMA is used and compute the log likelihood */
+          if(tr->rateHetModel == PLL_CAT)
+            partitionLikelihood = evaluateCAT_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt, 
+                                                x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, getPerSiteLikelihoods);
+          else
+            partitionLikelihood = evaluateGAMMA_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, getPerSiteLikelihoods);
+#else
+   
+          /* if we want to compute the per-site likelihoods, we use the generic evaluate function implementations 
+             for this, because the slowdown is not that dramatic */
+
+          if(getPerSiteLikelihoods)
+            {         
+#ifdef __MIC_NATIVE
+                          // not supported on MIC!
+                          assert(0 && "Per-site LH calculations is not implemented on Intel MIC");
+#else
+               if(tr->rateHetModel == PLL_CAT)
+                {
+                   if(tr->saveMemory)
+                     partitionLikelihood = evaluateCAT_FLEX_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                                 x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                 tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE,
+                                                                 x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+                   else
+                     partitionLikelihood = evaluateCAT_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                            x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                            tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE);
+                }
+              else
+                {
+                  if(tr->saveMemory)
+                    partitionLikelihood = evaluateGAMMA_FLEX_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                                  x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                  tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE, 
+                                                                  x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);              
+                  else
+                    partitionLikelihood = evaluateGAMMA_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                             x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                             tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE);
+                }
+#endif
+            }
+          else
+            {
+              /* for the optimized code path we have a dedicated, optimized function implementation 
+                 for each rate heterogeneity and data type combination; we switch over the number of states 
+                 and the rate heterogeneity model */
+              
+              switch(states)
+                {         
+                case 2: /* binary */
+                  assert (!tr->saveMemory);
+                  if (tr->rateHetModel == PLL_CAT)
+                   {
+                     partitionLikelihood =  evaluateGTRCAT_BINARY(ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                                  x1_start, x2_start, pr->partitionData[model]->tipVector, 
+                                                                  tip, width, diagptable, fastScaling);
+                   }
+                  else
+                   {
+                     partitionLikelihood = evaluateGTRGAMMA_BINARY(ex1, ex2, pr->partitionData[model]->wgt,
+                                                                   x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                   tip, width, diagptable, fastScaling);                 
+                   }
+                  break;
+                case 4: /* DNA */
+                  {
+
+#ifdef __MIC_NATIVE
+
+                  /* CAT & memory saving are not supported on MIC */
+
+                  assert(!tr->saveMemory);
+                  assert(tr->rateHetModel == PLL_GAMMA);
+
+                  partitionLikelihood =  evaluateGTRGAMMA_MIC(ex1, ex2, pr->partitionData[model]->wgt,
+                                              x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                              tip, width, diagptable, fastScaling);
+#else
+                    if(tr->rateHetModel == PLL_CAT)
+                      {                           
+                        if(tr->saveMemory)
+                          partitionLikelihood =  evaluateGTRCAT_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                                     x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                     tip, width, diagptable, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+                        else
+                          partitionLikelihood =  evaluateGTRCAT(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                                x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                tip, width, diagptable);
+                      }
+                    else
+                      {         
+                        if(tr->saveMemory)                 
+                          partitionLikelihood =  evaluateGTRGAMMA_GAPPED_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                                              x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                              tip, width, diagptable,
+                                                                              x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);                  
+                        else
+                          partitionLikelihood =  evaluateGTRGAMMA(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                                  x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                  tip, width, diagptable);                                
+                      }
+#endif
+                  }
+                  break;                                   
+                case 20: /* proteins */
+                  {
+
+#ifdef __MIC_NATIVE
+
+                  /* CAT & memory saving are not supported on MIC */
+
+                  assert(!tr->saveMemory);
+                  assert(tr->rateHetModel == PLL_GAMMA);
+
+                  if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                    partitionLikelihood =  evaluateGTRGAMMAPROT_LG4_MIC(pr->partitionData[model]->wgt,
+                                                                    x1_start, x2_start, pr->partitionData[model]->tipVector_LG4,
+                                                                    tip, width, diagptable, pr->partitionData[model]->lg4x_weights);
+                  else
+                        partitionLikelihood =  evaluateGTRGAMMAPROT_MIC(ex1, ex2, pr->partitionData[model]->wgt,
+                                              x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                              tip, width, diagptable, fastScaling);
+
+//                  printf("tip: %p, width: %d,  lh: %f\n", tip, width, partitionLikelihood);
+//                  int g;
+//                  if (x1_start)
+//                                        for (g = 0; g < 20; ++g)
+//                                                printf("%f \t", x1_start[g]);
+//                  printf("\n");
+//                  if (x2_start)
+//                                        for (g = 0; g < 20; ++g)
+//                                                printf("%f \t", x2_start[g]);
+#else
+
+                      if(tr->rateHetModel == PLL_CAT)
+                      {                           
+                        if(tr->saveMemory)
+                          partitionLikelihood = evaluateGTRCATPROT_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                                        x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                        tip, width, diagptable,  x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+                        else
+                          partitionLikelihood = evaluateGTRCATPROT(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                                   x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                   tip, width, diagptable);               
+                      }
+                    else
+                      {                                               
+                        if(tr->saveMemory)
+                          partitionLikelihood = evaluateGTRGAMMAPROT_GAPPED_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                                                 x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                                 tip, width, diagptable,
+                                                                                 x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+                        else
+                      {
+                        if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                          partitionLikelihood =  evaluateGTRGAMMAPROT_LG4((int *)NULL, (int *)NULL, pr->partitionData[model]->wgt,
+                                                                          x1_start, x2_start, pr->partitionData[model]->tipVector_LG4,
+                                                                          tip, width, diagptable, PLL_TRUE, pr->partitionData[model]->lg4x_weights);
+                        else
+                          partitionLikelihood = evaluateGTRGAMMAPROT(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                                     x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                     tip, width, diagptable);           
+                      }
+                      }
+#endif
+                  }
+                  break;                            
+                default:
+                  assert(0);        
+                }
+            }
+#endif
+              
+          /* check that there was no major numerical screw-up, the log likelihood should be < 0.0 always */
+          
+          assert(partitionLikelihood < 0.0);
+          
+          /* now here is a nasty part: for each partition and each node we maintain an integer counter that records 
+             how many entries of that node's vector were scaled by a constant factor. Here we use this information, generated during Felsenstein's 
+             pruning algorithm by the newview() functions, to undo the preceding scaling multiplications at the root; for the mathematical details 
+             you should actually read:
+             
+             A. Stamatakis: "Orchestrating the Phylogenetic Likelihood Function on Emerging Parallel Architectures". 
+             In B. Schmidt, editor, Bioinformatics: High Performance Parallel Computer Architectures, 85-115, CRC Press, Taylor & Francis, 2010.
+             
+             There's a copy of this book in my office 
+          */
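+
+          /* Editor's note (an assumption based on the standard RAxML/PLL scaling scheme, added for
+             clarity): PLL_MINLIKELIHOOD is defined as 1/2^256, the reciprocal of the factor by which
+             newview() multiplies a site's vector whenever all of its entries drop below that threshold.
+             Each of the globalScaler[pNumber] + globalScaler[qNumber] scaling events therefore
+             contributed a factor of 2^256 to the raw likelihood, and the line below removes them again
+             by adding (number of scaling events) * log(PLL_MINLIKELIHOOD), a negative quantity, to the
+             log likelihood. */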
+          
+          if(fastScaling)
+            partitionLikelihood += (pr->partitionData[model]->globalScaler[pNumber] + pr->partitionData[model]->globalScaler[qNumber]) * log(PLL_MINLIKELIHOOD);
+          
+          /* now we have the correct log likelihood for the current partition after undoing scaling multiplications */           
+          
+          /* finally, we also store the per partition log likelihood which is important for optimizing the alpha parameter 
+             of this partition for example */
+
+          /* asc bias stuff */
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+          if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
+#else
+          if (pr->partitionData[model]->ascBias)
+#endif
+           {
+             size_t
+               i;
+             
+             int        
+               w = 0;
+             
+             double                                
+               correction;
+
+             switch(tr->rateHetModel)
+               {
+               case PLL_CAT:
+                 {
+                   double 
+                     rates = 1.0;
+                   
+                   //need to re-calculate P-matrix for the correction here assuming a rate of 1.0 
+                   calcDiagptable(z, states, 1, &rates, pr->partitionData[model]->EIGN, diagptable);
+                   
+                   
+                   correction = evaluateCatAsc(ex1_asc, ex2_asc, x1_start_asc, x2_start_asc, pr->partitionData[model]->tipVector,
+                                               tip, ascWidth, diagptable, ascWidth);
+                 }
+                 break;
+               case PLL_GAMMA:                       
+                 correction = evaluateGammaAsc(ex1_asc, ex2_asc, x1_start_asc, x2_start_asc, pr->partitionData[model]->tipVector,
+                                               tip, ascWidth, diagptable, ascWidth);
+                 break;
+               default:
+                 assert(0);
+               }
+             
+             
+             
+             for(i = (size_t)pr->partitionData[model]->lower; i < (size_t)pr->partitionData[model]->upper; i++)
+               w += tr->aliaswgt[i];
+
+             partitionLikelihood = partitionLikelihood - (double)w * log(1.0 - correction);                  
+              
+           }
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+          if(!(pr->partitionData[model]->ascBias && tr->threadID == 0))
+           {
+#endif
+             if(partitionLikelihood >= 0.0)
+               {
+                 printf("positive log like: %f for partition %d\n", partitionLikelihood, model);
+                 assert(0);
+               }
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+           }
+#endif
+
+          
+          pr->partitionData[model]->partitionLH = partitionLikelihood;
+        }
+      else
+        {
+          /* if the current thread does not have a single site of this partition,
+             it is important to set the per-partition log likelihood to 0.0 because 
+             of the reduction operation that will take place later on.
+             That is, the values of tr->perPartitionLH across all threads 
+             always need to be in a consistent state!
+          */
+          
+          if(width == 0)            
+            pr->partitionData[model]->partitionLH = 0.0;
+        }
+    }
+
+
+#ifdef DEBUG_PERSITE_LNL
+  /* per persite-stuff */
+  {
+    int model = 0; 
+    for(model = 0; model < pr->numberOfPartitions ; ++model)
+      {
+        int j= 0; 
+        pInfo *partition  =  pr->partitionData[model]; 
+        for(j = 0;  j < partition->width; ++j)
+          printf("[%d] lnl[%d]=%f\n", tr->threadID, j, partition->perSiteLikelihoods[j]); 
+
+      }
+  }
+
+#endif
+}
+
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluate the log likelihood of the tree topology
+
+    Evaluate the log likelihood of the tree topology of instance \a tr by
+    assuming a virtual root between nodes \a p and \a p->back. If
+    \a fullTraversal is set to \b PLL_TRUE then the log likelihood vectors for
+    each node are recomputed from scratch.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Specifies the virtual root, which is assumed to be a virtual node placed on the edge connecting \a p and \a p->back
+
+    @param fullTraversal
+      If set to \b PLL_TRUE, then the likelihood vectors at all nodes are recomputed, otherwise only the
+      necessary vectors (those that are not oriented in the right direction) are recomputed.
+
+    @param getPerSiteLikelihoods
+      Also compute and store (in \a tr->lhs) the log likelihood of each site of the (compressed) alignment
+
+    @note
+      If \a getPerSiteLikelihoods is set to \b PLL_TRUE, then make sure that \a tr->fastScaling is set to
+      \b PLL_FALSE, otherwise an assertion will fail.
+*/
+void pllEvaluateLikelihood (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean fullTraversal, pllBoolean getPerSiteLikelihoods)
+{
+  /* now this may be the entry point of the library to compute 
+     the log like at a branch defined by p and p->back == q */
+
+  volatile double 
+    result = 0.0;
+
+  nodeptr 
+    q = p->back; 
+  
+
+  pllBoolean
+        p_recom = PLL_FALSE, /* if one of them was missing, we will need to force recomputation */
+        q_recom = PLL_FALSE;
+
+  int
+    i,
+    model,
+    numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions : 1;
+
+  /* if evaluate shall return the per-site log likelihoods 
+     fastScaling needs to be disabled, otherwise this will 
+     not work */
+
+  if(getPerSiteLikelihoods)          
+    assert(!(tr->fastScaling)); 
+
+  /* set the first entry of the traversal descriptor to contain the indices
+     of nodes p and q */
+
+  tr->td[0].ti[0].pNumber = p->number;
+  tr->td[0].ti[0].qNumber = q->number;          
+
+  /* copy the branch lengths of the tree into the first entry of the traversal descriptor.
+     if -M is not used tr->numBranches must be 1 */
+
+  for(i = 0; i < numBranches; i++)
+    tr->td[0].ti[0].qz[i] =  q->z[i];
+
+  /* recom part */
+  if(tr->useRecom)
+  {
+    int slot = -1;
+    if(!isTip(q->number, tr->mxtips))
+    {
+      q_recom = getxVector(tr->rvec, q->number, &slot, tr->mxtips);
+      tr->td[0].ti[0].slot_q = slot;
+    }
+    if(!isTip(p->number, tr->mxtips))
+    {
+      p_recom = getxVector(tr->rvec, p->number, &slot, tr->mxtips);
+      tr->td[0].ti[0].slot_p = slot;
+    }
+    if(!isTip(p->number, tr->mxtips) &&  !isTip(q->number, tr->mxtips))
+      assert(tr->td[0].ti[0].slot_q != tr->td[0].ti[0].slot_p);
+  }
+
+
+  /* now compute how many conditionals must be re-computed/re-oriented by newview
+     to be able to calculate the likelihood at the root defined by p and q.
+     */
+
+  /* one entry in the traversal descriptor is already used, hence set the traversal length counter to 1 */
+  tr->td[0].count = 1;
+
+  if(fullTraversal)
+  {
+    assert(isTip(q->back->number, tr->mxtips));
+    computeTraversal(tr, q, PLL_FALSE, numBranches);
+  }
+  else
+  {
+    if(p_recom || needsRecomp(tr->useRecom, tr->rvec, p, tr->mxtips))
+      computeTraversal(tr, p, PLL_TRUE, numBranches);
+
+    if(q_recom || needsRecomp(tr->useRecom, tr->rvec, q, tr->mxtips))
+      computeTraversal(tr, q, PLL_TRUE, numBranches);
+  }
+
+
+  /* now we copy the partition execute mask into the traversal descriptor; it must come from the 
+     calling program, since this logic should not form part of the library */
+
+  storeExecuteMaskInTraversalDescriptor(tr, pr);
+
+  /* also store in the traversal descriptor that something has changed i.e., in the parallel case that the 
+     traversal descriptor list of nodes needs to be broadcast once again */
+
+  tr->td[0].traversalHasChanged = PLL_TRUE;
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+
+  /* now here we enter the fork-join region for Pthreads */
+
+
+  /* start the parallel region and tell all threads to compute the log likelihood for 
+     their fraction of the data. This call is implemented in the case switch of execFunction in axml.c
+     */
+  if(getPerSiteLikelihoods)
+    {
+      memset(tr->lhs, 0, sizeof(double) * tr->originalCrunchedLength); 
+      pllMasterBarrier(tr, pr, PLL_THREAD_EVALUATE_PER_SITE_LIKES);
+    }
+  else
+    pllMasterBarrier (tr, pr, PLL_THREAD_EVALUATE);
+
+  /* and now here we explicitly do the reduction operation, that is, we add up the 
+     per-thread and per-partition log likelihoods to obtain the overall log likelihood 
+     over all sites and partitions */
+
+ 
+  /* 
+     for unpartitioned data that's easy: we just sum over the log likelihoods computed 
+     by each thread; thread 0 stores its result in reductionBuffer[0], thread 1 in 
+     reductionBuffer[1], and so on 
+     */
+
+  /* This reduction for the partitioned case is more complicated because each thread 
+     needs to store the partial log like of each partition and we then need to collect 
+     and add everything */
+
+#else
+  /* and here is just the sequential case, we directly call pllEvaluateIterative() above 
+     without having to tell the threads/processes that they need to compute this function now */
+
+  pllEvaluateIterative(tr, pr, getPerSiteLikelihoods); //PLL_TRUE
+
+  /*
+     if we want to obtain per-site log likelihoods, they have initially been stored 
+     in arrays that are associated with the partitions; now we 
+     copy them into the vector tr->lhs[].
+     We may also choose to require the user to provide an array, but this can be decided later on.
+  */
+
+  if(getPerSiteLikelihoods) //PLL_TRUE
+    {
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        memcpy(&(tr->lhs[pr->partitionData[model]->lower]), pr->partitionData[model]->perSiteLikelihoods, pr->partitionData[model]->width  * sizeof(double));
+    }
+
+#endif
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    result += pr->partitionData[model]->partitionLH;
+
+  /* set the tree data structure likelihood value to the total likelihood */
+
+  tr->likelihood = result;    
+
+  /* the code below is mainly for testing whether the per-site log 
+     likelihoods we have stored in tr->lhs yield the same 
+     likelihood as the likelihood we computed. 
+     For numerical reasons we need to make a dirty PLL_ABS(difference) < epsilon
+     comparison */
+     
+  if(getPerSiteLikelihoods) //PLL_TRUE
+    {
+      double 
+        likelihood = 0;
+      int i; 
+
+      /* note that in tr->lhs, we just store the likelihood of 
+         one representative of a potentially compressed pattern;
+         hence, we need to multiply the elements with the pattern 
+         weight vector */
+
+
+      for(i = 0; i < tr->originalCrunchedLength; i++)
+        {
+//          printf("lhs[%d]=%f * %d\n", i, tr->lhs[i], tr->aliaswgt[i]); 
+          likelihood += (tr->lhs[i]   * tr->aliaswgt[i] );
+        }
+         
+      if( PLL_ABS(tr->likelihood - likelihood) > 0.00001)
+        {
+  //        printf("likelihood was %f\t summed/weighted per-site-lnl was %f\n", tr->likelihood, likelihood); 
+        }
+
+        assert(PLL_ABS(tr->likelihood - likelihood) < 0.00001);
+    }
+
+
+  if(tr->useRecom)
+  {
+    unpinNode(tr->rvec, p->number, tr->mxtips);
+    unpinNode(tr->rvec, q->number, tr->mxtips);
+  }
+
+  /* do some bookkeeping to have traversalHasChanged in a consistent state */
+
+  tr->td[0].traversalHasChanged = PLL_FALSE;
+}
+
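+/* Editor's note: a hedged usage sketch (not part of the PLL sources) showing how a caller
+   might evaluate the likelihood at the branch between tr->start and tr->start->back and,
+   with fastScaling disabled, also obtain per-site log likelihoods in tr->lhs.
+   Instance setup and error handling are omitted. */
+#if 0
+static void exampleEvaluate(pllInstance *tr, partitionList *pr)
+{
+  tr->fastScaling = PLL_FALSE;   /* required when per-site likelihoods are requested */
+
+  pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_TRUE);
+
+  printf("overall log likelihood: %f\n", tr->likelihood);
+  printf("log likelihood of first site pattern: %f\n", tr->lhs[0]);
+}
+#endif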
+
+void perSiteLogLikelihoods(pllInstance *tr, partitionList *pr, double *logLikelihoods)
+{
+#if (!defined(_USE_PTHREADS) && !defined(_FINE_GRAIN_MPI))
+  double 
+    //likelihood,
+    accumulatedPerSiteLikelihood = 0.0;
+
+  size_t
+    localCount,
+    i,
+    //globalCounter,
+    lower,
+    upper;
+  int model;
+#endif
+  /* compute the likelihood of the tree with the standard function to:
+     1. obtain the current score for error checking
+     2. store a full tree traversal in the traversal descriptor that 
+     will then be used for calculating per-site log likelihoods 
+     for each site individually and independently */
+
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+  //likelihood = tr->likelihood;
+
+  /* now compute per-site log likelihoods using the respective functions */
+
+#if (defined( _USE_PTHREADS ) || defined(_FINE_GRAIN_MPI))
+  /* here we need a barrier to invoke a parallel region that calls 
+     function 
+     perSiteLogLikelihoodsPthreads(tree *tr, partitionList *pr, double *lhs, int n, int tid)
+     defined above and subsequently collects the per-site log likelihoods 
+     computed by the threads and stored in local per-thread memory 
+     and stores them in buffer tr->lhs.
+     This corresponds to a gather operation in MPI.
+     */
+
+  pllMasterBarrier (tr, pr, PLL_THREAD_PER_SITE_LIKELIHOODS);
+
+  /* 
+     when the parallel region has terminated, the per-site log likelihoods 
+     are stored in array tr->lhs of the master thread which we copy to the result buffer
+  */
+  
+  memcpy(logLikelihoods, tr->lhs, sizeof(double) * tr->originalCrunchedLength);
+
+
+#else
+
+  /* sequential case: just loop over all partitions and compute per site log likelihoods */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+  {
+    lower = pr->partitionData[model]->lower;
+    upper = pr->partitionData[model]->upper;
+
+    for(i = lower, localCount = 0; i < upper; i++, localCount++)
+    {
+      double 
+        l;
+
+      /* 
+         we need to switch over the rate heterogeneity implementations here:
+         when we have PSR (CAT) we actually need to provide the per-site rate 
+         to the function evaluatePartialGeneric() that computes the 
+         per-site log likelihood.
+         Under GAMMA the rate will just be ignored, so here we simply set it to 1.0
+         */
+
+      switch(tr->rateHetModel)
+      {
+        case PLL_CAT:
+          l = evaluatePartialGeneric (tr, pr, i, pr->partitionData[model]->perSiteRates[pr->partitionData[model]->rateCategory[localCount]], model);
+          break;
+        case PLL_GAMMA:
+          l = evaluatePartialGeneric (tr, pr, i, 1.0, model);
+          break;
+        default:
+          assert(0);
+      }
+
+      /* store value in result array and add the likelihood of this site to the overall likelihood */
+
+      logLikelihoods[i] = l;
+      accumulatedPerSiteLikelihood += l;
+    } 
+  }
+
+
+  /* error checking. We need a dirty PLL_ABS() < epsilon comparison here, because the implementations 
+     (standard versus per-site) are pretty different and hence slight numerical 
+     deviations are expected */
+
+  assert(PLL_ABS(tr->likelihood - accumulatedPerSiteLikelihood) < 0.00001);
+  
+#endif
+  
+
+
+}
+
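+/* Editor's note: a minimal usage sketch (not part of the PLL sources) for the function above;
+   the caller provides a buffer with one double per (compressed) site pattern. Availability of
+   malloc()/free() in this translation unit is assumed. */
+#if 0
+static void examplePerSiteLikes(pllInstance *tr, partitionList *pr)
+{
+  double *lhs = (double *)malloc(sizeof(double) * tr->originalCrunchedLength);
+
+  if (lhs)
+    {
+      perSiteLogLikelihoods(tr, pr, lhs);
+      printf("log likelihood of site pattern 0: %f\n", lhs[0]);
+      free(lhs);
+    }
+}
+#endif
+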
+#if (defined(__SSE3) || defined(__AVX))
+static double evaluateGTRCAT_BINARY (int *ex1, int *ex2, int *cptr, int *wptr,
+                                     double *x1_start, double *x2_start, double *tipVector,                   
+                                     unsigned char *tipX1, int n, double *diagptable_start, const pllBoolean fastScaling)
+{
+  double  sum = 0.0, term;       
+  int     i;
+#if (!defined(__SSE3) && !defined(__AVX))
+  int j;  
+#endif
+  double  *diagptable, *x1, *x2;                            
+ 
+  if(tipX1)
+    {          
+      for (i = 0; i < n; i++) 
+        {
+#if (defined(__SSE3) || defined(__AVX))
+          PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+#endif
+          x1 = &(tipVector[2 * tipX1[i]]);
+          x2 = &(x2_start[2 * i]);
+          
+          diagptable = &(diagptable_start[2 * cptr[i]]);                          
+        
+#if (defined(__SSE3) || defined(__AVX))
+          _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1), _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diagptable))));
+          
+          if(fastScaling)
+            term = log(fabs(t[0] + t[1]));
+          else
+            term = log(fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));                           
+#else               
+          for(j = 0, term = 0.0; j < 2; j++)                         
+            term += x1[j] * x2[j] * diagptable[j];            
+                 
+          if(fastScaling)
+            term = log(fabs(term));
+          else
+            term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));                                                      
+#endif    
+
+          sum += wptr[i] * term;
+        }       
+    }               
+  else
+    {
+      for (i = 0; i < n; i++) 
+        {       
+#if (defined(__SSE3) || defined(__AVX))
+		  PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+#endif                  
+          x1 = &x1_start[2 * i];
+          x2 = &x2_start[2 * i];
+          
+          diagptable = &diagptable_start[2 * cptr[i]];            
+#if (defined(__SSE3) || defined(__AVX))
+          _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1), _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diagptable))));
+          
+          if(fastScaling)
+            term = log(fabs(t[0] + t[1]));
+          else
+            term = log(fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));                        
+#else     
+          for(j = 0, term = 0.0; j < 2; j++)
+            term += x1[j] * x2[j] * diagptable[j];   
+          
+          if(fastScaling)
+            term = log(fabs(term));
+          else
+            term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+#endif
+          
+          sum += wptr[i] * term;
+        }          
+    }
+       
+  return  sum;         
+} 
+
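+/* Editor's note: an illustrative sketch (not part of the PLL sources) of the SSE pattern used
+   above for the two-state (binary) case: the element-wise product of the two conditional vectors
+   and the P-matrix diagonal is computed with packed multiplies and then summed. 16-byte aligned
+   input arrays of length 2 are assumed, as for the PLL vectors. */
+#if 0
+static double sketchBinaryTerm(const double *x1, const double *x2, const double *diag)
+{
+  PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+
+  _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1),
+                             _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diag))));
+
+  return t[0] + t[1];
+}
+#endif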
+
+static double evaluateGTRGAMMA_BINARY(int *ex1, int *ex2, int *wptr,
+                                      double *x1_start, double *x2_start, 
+                                      double *tipVector, 
+                                      unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling)
+{
+  double   sum = 0.0, term;    
+  int     i, j;
+#if (!defined(__SSE3) && !defined(__AVX))
+  int k;
+#endif 
+  double  *x1, *x2;             
+
+  if(tipX1)
+    {          
+      for (i = 0; i < n; i++)
+        {
+#if (defined(__SSE3) || defined(__AVX))
+		  PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+          __m128d termv, x1v, x2v, dv;
+#endif
+          x1 = &(tipVector[2 * tipX1[i]]);       
+          x2 = &x2_start[8 * i];                                
+#if (defined(__SSE3) || defined(__AVX))
+          termv = _mm_set1_pd(0.0);                
+          
+          for(j = 0; j < 4; j++)
+            {
+              x1v = _mm_load_pd(&x1[0]);
+              x2v = _mm_load_pd(&x2[j * 2]);
+              dv   = _mm_load_pd(&diagptable[j * 2]);
+              
+              x1v = _mm_mul_pd(x1v, x2v);
+              x1v = _mm_mul_pd(x1v, dv);
+              
+              termv = _mm_add_pd(termv, x1v);                 
+            }
+          
+          _mm_store_pd(t, termv);               
+          
+          if(fastScaling)
+            term = log(0.25 * (fabs(t[0] + t[1])));
+          else
+            term = log(0.25 * (fabs(t[0] + t[1]))) + (ex2[i] * log(PLL_MINLIKELIHOOD));       
+#else
+          for(j = 0, term = 0.0; j < 4; j++)
+            for(k = 0; k < 2; k++)
+              term += x1[k] * x2[j * 2 + k] * diagptable[j * 2 + k];                                                
+          
+          if(fastScaling)
+            term = log(0.25 * fabs(term));
+          else
+            term = log(0.25 * fabs(term)) + ex2[i] * log(PLL_MINLIKELIHOOD);
+#endif   
+          
+          sum += wptr[i] * term;
+        }         
+    }
+  else
+    {         
+      for (i = 0; i < n; i++) 
+        {
+#if (defined(__SSE3) || defined(__AVX))
+		  PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+          __m128d termv, x1v, x2v, dv;
+#endif                            
+          x1 = &x1_start[8 * i];
+          x2 = &x2_start[8 * i];
+                  
+#if (defined(__SSE3) || defined(__AVX))
+          termv = _mm_set1_pd(0.0);                
+          
+          for(j = 0; j < 4; j++)
+            {
+              x1v = _mm_load_pd(&x1[j * 2]);
+              x2v = _mm_load_pd(&x2[j * 2]);
+              dv   = _mm_load_pd(&diagptable[j * 2]);
+              
+              x1v = _mm_mul_pd(x1v, x2v);
+              x1v = _mm_mul_pd(x1v, dv);
+              
+              termv = _mm_add_pd(termv, x1v);                 
+            }
+          
+          _mm_store_pd(t, termv);
+          
+          
+          if(fastScaling)
+            term = log(0.25 * (fabs(t[0] + t[1])));
+          else
+            term = log(0.25 * (fabs(t[0] + t[1]))) + ((ex1[i] +ex2[i]) * log(PLL_MINLIKELIHOOD));     
+#else     
+          for(j = 0, term = 0.0; j < 4; j++)
+            for(k = 0; k < 2; k++)
+              term += x1[j * 2 + k] * x2[j * 2 + k] * diagptable[j * 2 + k];                                          
+
+          if(fastScaling)
+            term = log(0.25 * fabs(term));
+          else
+            term = log(0.25 * fabs(term)) + (ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD);
+#endif
+
+          sum += wptr[i] * term;
+        }                       
+    }
+
+  return sum;
+} 
+#endif
+
+
+
+/* below are the optimized function versions with geeky intrinsics */
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluation of log likelihood of a tree under the GAMMA model of rate heterogeneity and LG4 model of evolution
+    
+    This is the same as ::evaluateGAMMA_FLEX but for the LG4 model. It contains two implementations,
+    one which is the generic, and one that is optimized with SSE3 instructions. The two implementations
+    are separated by preprocessor macros.
+    The difference from ::evaluateGAMMA_FLEX is that we have 4 different tipVectors computed from the 4 different
+    Q matrix decompositions.
+    Please check ::evaluateGAMMA_FLEX for more information and a description of the common
+    input parameters.
+*/
+static double evaluateGTRGAMMAPROT_LG4(int *ex1, int *ex2, int *wptr,
+                                       double *x1, double *x2,  
+                                       double *tipVector[4], 
+                                       unsigned char *tipX1, int n, double *diagptable, const pllBoolean fastScaling,
+                                       double * lg4_weights)
+{
+  double   sum = 0.0, term;        
+  int     i, j, l;   
+  double  *left, *right;              
+  
+  if(tipX1)
+    {               
+      for (i = 0; i < n; i++) 
+        {
+#if (defined(__SSE3) || defined(__AVX))
+          __m128d tv = _mm_setzero_pd();
+                                  
+          for(j = 0, term = 0.0; j < 4; j++)
+            {
+              double *d = &diagptable[j * 20];
+
+              __m128d
+              	  t = _mm_setzero_pd(),
+              	  w = _mm_set1_pd(lg4_weights[j]);
+
+              left = &(tipVector[j][20 * tipX1[i]]);
+              right = &(x2[80 * i + 20 * j]);
+              for(l = 0; l < 20; l+=2)
+                {
+                  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+                  t = _mm_add_pd(t, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
+                }
+              tv = _mm_add_pd(tv, _mm_mul_pd(t, w));
+            }
+
+          tv = _mm_hadd_pd(tv, tv);
+          _mm_storel_pd(&term, tv);
+          
+
+#else                             
+          for(j = 0, term = 0.0; j < 4; j++)
+            {
+        	  double t = 0.0;
+
+              left = &(tipVector[j][20 * tipX1[i]]);
+              right = &(x2[80 * i + 20 * j]);
+
+              for(l = 0; l < 20; l++)
+                t += left[l] * right[l] * diagptable[j * 20 + l];
+
+              term += lg4_weights[j] * t;
+            }     
+#endif
+          
+          if(fastScaling)
+            term = log(fabs(term));
+          else
+            term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+
+          sum += wptr[i] * term;
+
+        }               
+    }              
+  else
+    {
+      for (i = 0; i < n; i++) 
+        {                                    
+#if (defined(__SSE3) || defined(__AVX))
+          __m128d tv = _mm_setzero_pd();                          
+              
+          for(j = 0, term = 0.0; j < 4; j++)
+            {
+              double *d = &diagptable[j * 20];
+
+              __m128d
+              t = _mm_setzero_pd(),
+              w = _mm_set1_pd(lg4_weights[j]);
+
+              left  = &(x1[80 * i + 20 * j]);
+              right = &(x2[80 * i + 20 * j]);
+              
+              for(l = 0; l < 20; l+=2)
+                {
+                  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+                  t = _mm_add_pd(t, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
+                }
+              tv = _mm_add_pd(tv, _mm_mul_pd(t, w));
+            }
+          tv = _mm_hadd_pd(tv, tv);
+          _mm_storel_pd(&term, tv);       
+#else
+          for(j = 0, term = 0.0; j < 4; j++)
+            {
+        	  double t = 0.0;
+
+              left  = &(x1[80 * i + 20 * j]);
+              right = &(x2[80 * i + 20 * j]);       
+              
+              for(l = 0; l < 20; l++)
+                t += left[l] * right[l] * diagptable[j * 20 + l];
+
+              term += lg4_weights[j] * t;
+            }
+#endif
+          
+          if(fastScaling)
+            term = log(fabs(term));
+          else
+            term = log(fabs(term)) + ((ex1[i] + ex2[i])*log(PLL_MINLIKELIHOOD));
+          
+          sum += wptr[i] * term;
+        }         
+    }
+
+  return  sum;
+}
+
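+/* Editor's note: a plain-C sketch (not part of the PLL sources) of the LG4 site term computed
+   above for two inner nodes: each of the 4 rate classes has its own conditional vectors and
+   P-matrix diagonal, and the per-class dot products are combined with the LG4X class weights
+   rather than a uniform 0.25. */
+#if 0
+static double sketchLG4SiteTerm(const double *x1, const double *x2,
+                                const double *diagptable, const double *lg4_weights)
+{
+  double term = 0.0;
+  int j, l;
+
+  for (j = 0; j < 4; j++)                 /* the 4 LG4 rate classes */
+    {
+      double t = 0.0;
+
+      for (l = 0; l < 20; l++)            /* 20 amino-acid states */
+        t += x1[20 * j + l] * x2[20 * j + l] * diagptable[20 * j + l];
+
+      term += lg4_weights[j] * t;         /* weight of rate class j */
+    }
+
+  return term;
+}
+#endif
+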
+#if (defined(__SSE3) || defined(__AVX))
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluation of log likelihood of a tree using the \b GAMMA model of rate heterogeneity 
+    and the memory saving technique (Optimized SSE3 version for AA data)
+ 
+    This is the SSE3 optimized version of ::evaluateGAMMA_FLEX_SAVE for evaluating the log
+    likelihood at some edge whose two end-points (nodes) have the conditional likelihood
+    vectors \a x1 and \a x2. Please check ::evaluateGAMMA_FLEX_SAVE for more information and
+    a description of the input parameters
+*/
+static double evaluateGTRGAMMAPROT_GAPPED_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                                double *x1, double *x2,  
+                                                double *tipVector, 
+                                                unsigned char *tipX1, int n, double *diagptable, 
+                                                double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)                                    
+{
+  double   sum = 0.0, term;        
+  int     i, j, l;   
+  double  
+    *left, 
+    *right,
+    *x1_ptr = x1,
+    *x2_ptr = x2,
+    *x1v,
+    *x2v;              
+  __m128d tv;
+
+  if(tipX1)
+  {               
+    for (i = 0; i < n; i++) 
+    {
+      if(x2_gap[i / 32] & mask32[i % 32])
+        x2v = x2_gapColumn;
+      else
+      {
+        x2v = x2_ptr;
+        x2_ptr += 80;
+      }
+
+	  //TUNG: Standard C does not allow declaration after executable statement
+	  tv = _mm_setzero_pd();
+      //__m128d tv = _mm_setzero_pd();
+      left = &(tipVector[20 * tipX1[i]]);                 
+
+      for(j = 0, term = 0.0; j < 4; j++)
+      {
+        double *d = &diagptable[j * 20];
+        right = &(x2v[20 * j]);
+        for(l = 0; l < 20; l+=2)
+        {
+          __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+          tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));                
+        }                               
+      }
+
+      tv = _mm_hadd_pd(tv, tv);
+      _mm_storel_pd(&term, tv);
+
+
+      if(!fastScaling)
+        term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(0.25 * fabs(term));    
+
+      sum += wptr[i] * term;
+    }                   
+  }              
+  else
+  {
+    for (i = 0; i < n; i++) 
+    {
+      if(x1_gap[i / 32] & mask32[i % 32])
+        x1v = x1_gapColumn;
+      else
+      {
+        x1v = x1_ptr;
+        x1_ptr += 80;
+      }
+
+      if(x2_gap[i / 32] & mask32[i % 32])
+        x2v = x2_gapColumn;
+      else
+      {
+        x2v = x2_ptr;
+        x2_ptr += 80;
+      }
+
+      //__m128d tv = _mm_setzero_pd(); 
+	  tv = _mm_setzero_pd();
+
+      for(j = 0, term = 0.0; j < 4; j++)
+      {
+        double *d = &diagptable[j * 20];
+        left  = &(x1v[20 * j]);
+        right = &(x2v[20 * j]);
+
+        for(l = 0; l < 20; l+=2)
+        {
+          __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+          tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));                
+        }                               
+      }
+      tv = _mm_hadd_pd(tv, tv);
+      _mm_storel_pd(&term, tv);   
+
+
+       if(!fastScaling)
+        term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(0.25 * fabs(term));
+
+
+      sum += wptr[i] * term;
+    }         
+  }
+
+  return  sum;
+}
+
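+/* Editor's note: an illustrative sketch (not part of the PLL sources) of the memory-saving gap
+   lookup used above: sites whose conditional vector at a node equals the shared "gap column"
+   (because the corresponding subtree contains only undetermined characters) are not stored
+   individually; a per-node bit vector records which sites those are, and mask32 is the global
+   table of single-bit masks used throughout this file. */
+#if 0
+static const double *sketchSelectColumn(const unsigned int *gapBits, int site,
+                                        const double *gapColumn,
+                                        const double **movingPtr, int span)
+{
+  if (gapBits[site / 32] & mask32[site % 32])
+    return gapColumn;                     /* gap site: use the shared column */
+  else
+    {
+      const double *column = *movingPtr;  /* non-gap site: consume the next stored column */
+
+      *movingPtr += span;                 /* span is 80 for protein GAMMA vectors */
+      return column;
+    }
+}
+#endif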
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluation of log likelihood of a tree using the \b GAMMA model of rate heterogeneity 
+    (Optimized SSE3 version for AA data)
+ 
+    This is the SSE3 optimized version of ::evaluateGAMMA_FLEX for evaluating the log
+    likelihood at some edge whose two end-points (nodes) have the conditional likelihood
+    vectors \a x1 and \a x2. Please check ::evaluateGAMMA_FLEX for more information and
+    a description of the common input parameters
+*/
+static double evaluateGTRGAMMAPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                    double *x1, double *x2,  
+                                    double *tipVector, 
+                                    unsigned char *tipX1, int n, double *diagptable)
+{
+  double   sum = 0.0, term;        
+  int     i, j, l;   
+  double  *left, *right;              
+
+  if(tipX1)
+  {               
+    for (i = 0; i < n; i++) 
+    {
+
+      __m128d tv = _mm_setzero_pd();
+      left = &(tipVector[20 * tipX1[i]]);                 
+
+      for(j = 0, term = 0.0; j < 4; j++)
+      {
+        double *d = &diagptable[j * 20];
+        right = &(x2[80 * i + 20 * j]);
+        for(l = 0; l < 20; l+=2)
+        {
+          __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+          tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));                
+        }                               
+      }
+      tv = _mm_hadd_pd(tv, tv);
+      _mm_storel_pd(&term, tv);
+
+
+      if(!fastScaling)
+        term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(0.25 * fabs(term));
+
+
+      sum += wptr[i] * term;
+    }                   
+  }              
+  else
+  {
+    for (i = 0; i < n; i++) 
+    {                                
+      __m128d tv = _mm_setzero_pd();                      
+
+      for(j = 0, term = 0.0; j < 4; j++)
+      {
+        double *d = &diagptable[j * 20];
+        left  = &(x1[80 * i + 20 * j]);
+        right = &(x2[80 * i + 20 * j]);
+
+        for(l = 0; l < 20; l+=2)
+        {
+          __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+          tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));                
+        }                               
+      }
+      tv = _mm_hadd_pd(tv, tv);
+      _mm_storel_pd(&term, tv);   
+
+
+       if(!fastScaling)
+        term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(0.25 * fabs(term));
+
+
+      sum += wptr[i] * term;
+    }
+  }
+
+  return  sum;
+}
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity 
+    (Optimized SSE3 version for AA data)
+ 
+    This is the SSE3 optimized version of ::evaluateCAT_FLEX for evaluating the log
+    likelihood at some edge whose two end-points (nodes) have the conditional likelihood
+    vectors \a x1 and \a x2. Please check ::evaluateCAT_FLEX for more information and
+    a description of the common input parameters
+*/
+static double evaluateGTRCATPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                  double *x1, double *x2, double *tipVector,
+                                  unsigned char *tipX1, int n, double *diagptable_start)
+{
+  double   sum = 0.0, term;
+  double  *diagptable,  *left, *right;
+  int     i, l;                           
+  __m128d tv;
+
+  if(tipX1)
+  {                 
+    for (i = 0; i < n; i++) 
+    {           
+      left = &(tipVector[20 * tipX1[i]]);
+      right = &(x2[20 * i]);
+
+      diagptable = &diagptable_start[20 * cptr[i]];                      
+
+      /* tv is declared at function scope above (C89 forbids declarations after statements) */
+      tv = _mm_setzero_pd();
+
+      for(l = 0; l < 20; l+=2)
+      {
+        __m128d lv = _mm_load_pd(&left[l]);
+        __m128d rv = _mm_load_pd(&right[l]);
+        __m128d mul = _mm_mul_pd(lv, rv);
+        __m128d dv = _mm_load_pd(&diagptable[l]);
+
+        tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));                  
+      }                         
+
+      tv = _mm_hadd_pd(tv, tv);
+      _mm_storel_pd(&term, tv);
+
+      if(!fastScaling)
+        term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(term));
+
+      sum += wptr[i] * term;
+    }      
+  }    
+  else
+  {
+
+    for (i = 0; i < n; i++) 
+    {                                 
+      left  = &x1[20 * i];
+      right = &x2[20 * i];
+
+      diagptable = &diagptable_start[20 * cptr[i]];             
+
+      __m128d tv = _mm_setzero_pd();        
+
+      for(l = 0; l < 20; l+=2)
+      {
+        __m128d lv = _mm_load_pd(&left[l]);
+        __m128d rv = _mm_load_pd(&right[l]);
+        __m128d mul = _mm_mul_pd(lv, rv);
+        __m128d dv = _mm_load_pd(&diagptable[l]);
+
+        tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));                  
+      }                         
+
+      tv = _mm_hadd_pd(tv, tv);
+      _mm_storel_pd(&term, tv);
+
+      if(!fastScaling)
+        term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(term));  
+
+      sum += wptr[i] * term;      
+    }
+  }
+
+  return  sum;         
+} 
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity with memory saving 
+    (Optimized SSE3 version for AA data)
+ 
+    This is the SSE3 optimized version of ::evaluateCAT_FLEX_SAVE for evaluating the log
+    likelihood at some edge whose two end-points (nodes) have the conditional likelihood
+    vectors \a x1 and \a x2. Please check ::evaluateCAT_FLEX_SAVE for more information and
+    a description of the common input parameters
+*/
+static double evaluateGTRCATPROT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                       double *x1, double *x2, double *tipVector,
+                                       unsigned char *tipX1, int n, double *diagptable_start, 
+                                       double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  double   
+    sum = 0.0, 
+        term,
+        *diagptable,  
+        *left, 
+        *right,
+        *left_ptr = x1,
+        *right_ptr = x2;
+
+  int     
+    i, 
+    l;                           
+
+  if(tipX1)
+  {                 
+    for (i = 0; i < n; i++) 
+    {           
+      left = &(tipVector[20 * tipX1[i]]);
+
+      if(isGap(x2_gap, i))
+        right = x2_gapColumn;
+      else
+      {
+        right = right_ptr;
+        right_ptr += 20;
+      }          
+
+      diagptable = &diagptable_start[20 * cptr[i]];                      
+
+      __m128d tv = _mm_setzero_pd();        
+
+      for(l = 0; l < 20; l+=2)
+      {
+        __m128d lv = _mm_load_pd(&left[l]);
+        __m128d rv = _mm_load_pd(&right[l]);
+        __m128d mul = _mm_mul_pd(lv, rv);
+        __m128d dv = _mm_load_pd(&diagptable[l]);
+
+        tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));                  
+      }                         
+
+      tv = _mm_hadd_pd(tv, tv);
+      _mm_storel_pd(&term, tv);
+
+      if(!fastScaling)
+        term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(term));
+
+      sum += wptr[i] * term;
+    }      
+  }    
+  else
+  {
+
+    for (i = 0; i < n; i++) 
+    {                                     
+      if(isGap(x1_gap, i))
+        left = x1_gapColumn;
+      else
+      {
+        left = left_ptr;
+        left_ptr += 20;
+      }
+
+      if(isGap(x2_gap, i))
+        right = x2_gapColumn;
+      else
+      {
+        right = right_ptr;
+        right_ptr += 20;
+      }
+
+      diagptable = &diagptable_start[20 * cptr[i]];             
+
+      __m128d tv = _mm_setzero_pd();        
+
+      for(l = 0; l < 20; l+=2)
+      {
+        __m128d lv = _mm_load_pd(&left[l]);
+        __m128d rv = _mm_load_pd(&right[l]);
+        __m128d mul = _mm_mul_pd(lv, rv);
+        __m128d dv = _mm_load_pd(&diagptable[l]);
+
+        tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));                  
+      }                         
+
+      tv = _mm_hadd_pd(tv, tv);
+      _mm_storel_pd(&term, tv);
+
+      if(!fastScaling)
+        term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(term));  
+
+      sum += wptr[i] * term;      
+    }
+  }
+
+  return  sum;         
+} 
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity with memory saving 
+    (Optimized SSE3 version for DNA data)
+ 
+    This is the SSE3 optimized version of ::evaluateCAT_FLEX_SAVE for evaluating the log
+    likelihood at some edge whose two end-points (nodes) have the conditional likelihood
+    vectors \a x1 and \a x2. Please check ::evaluateCAT_FLEX_SAVE for more information and
+    a description of the common input parameters
+*/
+static double evaluateGTRCAT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                   double *x1_start, double *x2_start, double *tipVector,                     
+                                   unsigned char *tipX1, int n, double *diagptable_start,
+                                   double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  double  sum = 0.0, term;       
+  int     i;
+
+  double  *diagptable, 
+          *x1, 
+          *x2,
+          *x1_ptr = x1_start,
+          *x2_ptr = x2_start;
+
+  if(tipX1)
+  {           
+    for (i = 0; i < n; i++) 
+    {   
+    	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+      __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
+
+      x1 = &(tipVector[4 * tipX1[i]]);
+
+      if(isGap(x2_gap, i))
+        x2 = x2_gapColumn;
+      else
+      {
+        x2 = x2_ptr;
+        x2_ptr += 4;
+      }
+
+      diagptable = &diagptable_start[4 * cptr[i]];
+
+      x1v1 =  _mm_load_pd(&x1[0]);
+      x1v2 =  _mm_load_pd(&x1[2]);
+      x2v1 =  _mm_load_pd(&x2[0]);
+      x2v2 =  _mm_load_pd(&x2[2]);
+      dv1  =  _mm_load_pd(&diagptable[0]);
+      dv2  =  _mm_load_pd(&diagptable[2]);
+
+      x1v1 = _mm_mul_pd(x1v1, x2v1);
+      x1v1 = _mm_mul_pd(x1v1, dv1);
+
+      x1v2 = _mm_mul_pd(x1v2, x2v2);
+      x1v2 = _mm_mul_pd(x1v2, dv2);
+
+      x1v1 = _mm_add_pd(x1v1, x1v2);
+
+      _mm_store_pd(t, x1v1);
+
+      if(!fastScaling)
+        term = log(fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(t[0] + t[1]));
+
+
+
+      sum += wptr[i] * term;
+    }   
+  }               
+  else
+  {
+    for (i = 0; i < n; i++) 
+    { 
+    	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+      __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
+
+      if(isGap(x1_gap, i))
+        x1 = x1_gapColumn;
+      else
+      {
+        x1 = x1_ptr;
+        x1_ptr += 4;
+      }
+
+      if(isGap(x2_gap, i))
+        x2 = x2_gapColumn;
+      else
+      {
+        x2 = x2_ptr;
+        x2_ptr += 4;
+      }
+
+      diagptable = &diagptable_start[4 * cptr[i]];      
+
+      x1v1 =  _mm_load_pd(&x1[0]);
+      x1v2 =  _mm_load_pd(&x1[2]);
+      x2v1 =  _mm_load_pd(&x2[0]);
+      x2v2 =  _mm_load_pd(&x2[2]);
+      dv1  =  _mm_load_pd(&diagptable[0]);
+      dv2  =  _mm_load_pd(&diagptable[2]);
+
+      x1v1 = _mm_mul_pd(x1v1, x2v1);
+      x1v1 = _mm_mul_pd(x1v1, dv1);
+
+      x1v2 = _mm_mul_pd(x1v2, x2v2);
+      x1v2 = _mm_mul_pd(x1v2, dv2);
+
+      x1v1 = _mm_add_pd(x1v1, x1v2);
+
+      _mm_store_pd(t, x1v1);
+
+
+       if(!fastScaling)
+        term = log(fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(t[0] + t[1]));
+
+      sum += wptr[i] * term;
+    }    
+  }
+
+  return  sum;         
+} 
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluation of log likelihood of a tree using the \b GAMMA model of rate heterogeneity with memory saving 
+    (Optimized SSE3 version for DNA data)
+ 
+    This is the SSE3 optimized version of ::evaluateGAMMA_FLEX_SAVE for evaluating the log
+    likelihood at some edge whose two end-points (nodes) have the conditional likelihood
+    vectors \a x1 and \a x2. Please check ::evaluateGAMMA_FLEX_SAVE for more information and
+    a description of the common input parameters
+*/
+static double evaluateGTRGAMMA_GAPPED_SAVE(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                           double *x1_start, double *x2_start, 
+                                           double *tipVector, 
+                                           unsigned char *tipX1, const int n, double *diagptable,
+                                           double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  double   sum = 0.0, term;    
+  int     i, j;
+  double  
+    *x1, 
+    *x2,
+    *x1_ptr = x1_start,
+    *x2_ptr = x2_start;
+
+
+
+  if(tipX1)
+  {        
+
+
+    for (i = 0; i < n; i++)
+    {
+    	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+      __m128d termv, x1v, x2v, dv;
+
+      x1 = &(tipVector[4 * tipX1[i]]);   
+      if(x2_gap[i / 32] & mask32[i % 32])
+        x2 = x2_gapColumn;
+      else
+      {
+        x2 = x2_ptr;     
+        x2_ptr += 16;
+      }
+
+
+      termv = _mm_set1_pd(0.0);            
+
+      for(j = 0; j < 4; j++)
+      {
+        x1v = _mm_load_pd(&x1[0]);
+        x2v = _mm_load_pd(&x2[j * 4]);
+        dv   = _mm_load_pd(&diagptable[j * 4]);
+
+        x1v = _mm_mul_pd(x1v, x2v);
+        x1v = _mm_mul_pd(x1v, dv);
+
+        termv = _mm_add_pd(termv, x1v);
+
+        x1v = _mm_load_pd(&x1[2]);
+        x2v = _mm_load_pd(&x2[j * 4 + 2]);
+        dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
+
+        x1v = _mm_mul_pd(x1v, x2v);
+        x1v = _mm_mul_pd(x1v, dv);
+
+        termv = _mm_add_pd(termv, x1v);
+      }
+
+      _mm_store_pd(t, termv);            
+
+       if(!fastScaling)
+        term = log(0.25 * fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(0.25 * fabs(t[0] + t[1]));
+
+
+      sum += wptr[i] * term;
+    }     
+  }
+  else
+  {        
+
+    for (i = 0; i < n; i++) 
+    {
+    	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+      __m128d termv, x1v, x2v, dv;
+
+      if(x1_gap[i / 32] & mask32[i % 32])
+        x1 = x1_gapColumn;
+      else
+      {
+        x1 = x1_ptr;              
+        x1_ptr += 16;
+      }
+
+      if(x2_gap[i / 32] & mask32[i % 32])
+        x2 = x2_gapColumn;
+      else
+      {
+        x2 = x2_ptr;
+        x2_ptr += 16;
+      }
+
+      termv = _mm_set1_pd(0.0);          
+
+      for(j = 0; j < 4; j++)
+      {
+        x1v = _mm_load_pd(&x1[j * 4]);
+        x2v = _mm_load_pd(&x2[j * 4]);
+        dv   = _mm_load_pd(&diagptable[j * 4]);
+
+        x1v = _mm_mul_pd(x1v, x2v);
+        x1v = _mm_mul_pd(x1v, dv);
+
+        termv = _mm_add_pd(termv, x1v);
+
+        x1v = _mm_load_pd(&x1[j * 4 + 2]);
+        x2v = _mm_load_pd(&x2[j * 4 + 2]);
+        dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
+
+        x1v = _mm_mul_pd(x1v, x2v);
+        x1v = _mm_mul_pd(x1v, dv);
+
+        termv = _mm_add_pd(termv, x1v);
+      }
+
+      _mm_store_pd(t, termv);
+
+      if(!fastScaling)
+        term = log(0.25 * fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(0.25 * fabs(t[0] + t[1]));
+
+
+      sum += wptr[i] * term;
+    }                           
+  }
+
+  return sum;
+} 
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluation of log likelihood of a tree using the \b GAMMA model of rate heterogeneity (Optimized SSE3 version for DNA data)
+ 
+    This is the SSE3 optimized version of ::evaluateGAMMA_FLEX for evaluating the log
+    likelihood at some edge whose two end-points (nodes) have the conditional likelihood
+    vectors \a x1 and \a x2. Please check ::evaluateGAMMA_FLEX for more information and
+    a description of the common input parameters
+*/
+static double evaluateGTRGAMMA(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                               double *x1_start, double *x2_start, 
+                               double *tipVector, 
+                               unsigned char *tipX1, const int n, double *diagptable)
+{
+  double   sum = 0.0, term;    
+  int     i, j;
+
+  double  *x1, *x2;             
+
+
+
+  if(tipX1)
+  {             
+    for (i = 0; i < n; i++)
+    {
+    	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+      __m128d termv, x1v, x2v, dv;
+
+      x1 = &(tipVector[4 * tipX1[i]]);   
+      x2 = &x2_start[16 * i];    
+
+
+      termv = _mm_set1_pd(0.0);            
+
+      for(j = 0; j < 4; j++)
+      {
+        x1v = _mm_load_pd(&x1[0]);
+        x2v = _mm_load_pd(&x2[j * 4]);
+        dv   = _mm_load_pd(&diagptable[j * 4]);
+
+        x1v = _mm_mul_pd(x1v, x2v);
+        x1v = _mm_mul_pd(x1v, dv);
+
+        termv = _mm_add_pd(termv, x1v);
+
+        x1v = _mm_load_pd(&x1[2]);
+        x2v = _mm_load_pd(&x2[j * 4 + 2]);
+        dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
+
+        x1v = _mm_mul_pd(x1v, x2v);
+        x1v = _mm_mul_pd(x1v, dv);
+
+        termv = _mm_add_pd(termv, x1v);
+      }
+
+      _mm_store_pd(t, termv);
+
+
+       if(!fastScaling)
+        term = log(0.25 * fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(0.25 * fabs(t[0] + t[1]));
+
+
+
+      sum += wptr[i] * term;
+    }     
+  }
+  else
+  {        
+    for (i = 0; i < n; i++) 
+    {
+    	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+      __m128d termv, x1v, x2v, dv;
+
+
+      x1 = &x1_start[16 * i];
+      x2 = &x2_start[16 * i];             
+
+
+      termv = _mm_set1_pd(0.0);          
+
+      for(j = 0; j < 4; j++)
+      {
+        x1v = _mm_load_pd(&x1[j * 4]);
+        x2v = _mm_load_pd(&x2[j * 4]);
+        dv   = _mm_load_pd(&diagptable[j * 4]);
+
+        x1v = _mm_mul_pd(x1v, x2v);
+        x1v = _mm_mul_pd(x1v, dv);
+
+        termv = _mm_add_pd(termv, x1v);
+
+        x1v = _mm_load_pd(&x1[j * 4 + 2]);
+        x2v = _mm_load_pd(&x2[j * 4 + 2]);
+        dv   = _mm_load_pd(&diagptable[j * 4 + 2]);
+
+        x1v = _mm_mul_pd(x1v, x2v);
+        x1v = _mm_mul_pd(x1v, dv);
+
+        termv = _mm_add_pd(termv, x1v);
+      }
+
+      _mm_store_pd(t, termv);
+
+      if(!fastScaling)
+        term = log(0.25 * fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(0.25 * fabs(t[0] + t[1]));
+
+
+
+      sum += wptr[i] * term;
+    }                           
+  }
+
+  return sum;
+} 
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity (Optimized SSE3 version for DNA data)
+ 
+    This is the SSE3 optimized version of ::evaluateCAT_FLEX for evaluating the log
+    likelihood at some edge whose two end-points (nodes) have the conditional likelihood
+    vectors \a x1 and \a x2. Please check ::evaluateCAT_FLEX for more information and
+    a description of the common input parameters
+*/
+static double evaluateGTRCAT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                              double *x1_start, double *x2_start, double *tipVector,                  
+                              unsigned char *tipX1, int n, double *diagptable_start)
+{
+  double  sum = 0.0, term;       
+  int     i;
+
+  double  *diagptable, *x1, *x2;                            
+
+  if(tipX1)
+  {           
+    for (i = 0; i < n; i++) 
+    {   
+    	PLL_ALIGN_BEGIN	double t[2] PLL_ALIGN_END;
+      __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
+
+      x1 = &(tipVector[4 * tipX1[i]]);
+      x2 = &x2_start[4 * i];
+
+      diagptable = &diagptable_start[4 * cptr[i]];
+
+
+      x1v1 =  _mm_load_pd(&x1[0]);
+      x1v2 =  _mm_load_pd(&x1[2]);
+      x2v1 =  _mm_load_pd(&x2[0]);
+      x2v2 =  _mm_load_pd(&x2[2]);
+      dv1  =  _mm_load_pd(&diagptable[0]);
+      dv2  =  _mm_load_pd(&diagptable[2]);
+
+      x1v1 = _mm_mul_pd(x1v1, x2v1);
+      x1v1 = _mm_mul_pd(x1v1, dv1);
+
+      x1v2 = _mm_mul_pd(x1v2, x2v2);
+      x1v2 = _mm_mul_pd(x1v2, dv2);
+
+      x1v1 = _mm_add_pd(x1v1, x1v2);
+
+      _mm_store_pd(t, x1v1);
+
+       if(!fastScaling)
+        term = log(fabs(t[0] + t[1])) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(t[0] + t[1]));
+
+
+      sum += wptr[i] * term;
+    }   
+  }               
+  else
+  {
+    for (i = 0; i < n; i++) 
+    { 
+    	PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+      __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;
+
+      x1 = &x1_start[4 * i];
+      x2 = &x2_start[4 * i];
+
+      diagptable = &diagptable_start[4 * cptr[i]];      
+
+
+      x1v1 =  _mm_load_pd(&x1[0]);
+      x1v2 =  _mm_load_pd(&x1[2]);
+      x2v1 =  _mm_load_pd(&x2[0]);
+      x2v2 =  _mm_load_pd(&x2[2]);
+      dv1  =  _mm_load_pd(&diagptable[0]);
+      dv2  =  _mm_load_pd(&diagptable[2]);
+
+      x1v1 = _mm_mul_pd(x1v1, x2v1);
+      x1v1 = _mm_mul_pd(x1v1, dv1);
+
+      x1v2 = _mm_mul_pd(x1v2, x2v2);
+      x1v2 = _mm_mul_pd(x1v2, dv2);
+
+      x1v1 = _mm_add_pd(x1v1, x1v2);
+
+      _mm_store_pd(t, x1v1);
+
+      if(!fastScaling)
+        term = log(fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(t[0] + t[1]));
+
+
+      sum += wptr[i] * term;
+    }    
+  }
+
+  return  sum;         
+} 
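+
+/* Illustrative sketch, not part of the upstream code: all SSE3 evaluators above share the same
+   reduction pattern per rate category -- multiply pairs of conditional-likelihood entries and the
+   corresponding diagonal P-matrix terms with _mm_mul_pd, accumulate with _mm_add_pd, and collapse
+   the two lanes with _mm_hadd_pd before taking the logarithm. The standalone example below shows
+   that pattern on toy data; the GCC-style alignment attribute is only used for the example. */
+#if 0
+#include <pmmintrin.h>
+#include <stdio.h>
+
+int main(void)
+{
+  /* 16-byte aligned toy vectors standing in for x1, x2 and the diagonal P-matrix terms */
+  __attribute__((aligned(16))) double left[4]  = {0.1, 0.2, 0.3, 0.4};
+  __attribute__((aligned(16))) double right[4] = {0.4, 0.3, 0.2, 0.1};
+  __attribute__((aligned(16))) double diag[4]  = {1.0, 0.9, 0.8, 0.7};
+
+  __m128d tv = _mm_setzero_pd();
+  double  term;
+  int     l;
+
+  for(l = 0; l < 4; l += 2)
+  {
+    __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+    tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&diag[l])));
+  }
+
+  tv = _mm_hadd_pd(tv, tv);        /* sum the two double lanes */
+  _mm_storel_pd(&term, tv);
+
+  /* term == 0.1*0.4*1.0 + 0.2*0.3*0.9 + 0.3*0.2*0.8 + 0.4*0.1*0.7 */
+  printf("weighted dot product = %f\n", term);
+  return 0;
+}
+#endif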
+
+
+
+
+
+#endif
diff --git a/pll/evaluatePartialGenericSpecial.c b/pll/evaluatePartialGenericSpecial.c
new file mode 100644
index 0000000..4d461a5
--- /dev/null
+++ b/pll/evaluatePartialGenericSpecial.c
@@ -0,0 +1,1378 @@
+/*  RAxML-VI-HPC (version 2.2) a program for sequential and parallel estimation of phylogenetic trees 
+ *  Copyright August 2006 by Alexandros Stamatakis
+ *
+ *  Partially derived from
+ *  fastDNAml, a program for estimation of phylogenetic trees from sequences by Gary J. Olsen
+ *  
+ *  and 
+ *
+ *  Programs of the PHYLIP package by Joe Felsenstein.
+ 
+ *  This program is free software; you may redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ * 
+ *
+ *  For any other enquiries send an Email to Alexandros Stamatakis
+ *  Alexandros.Stamatakis at epfl.ch
+ *
+ *  When publishing work that is based on the results from RAxML-VI-HPC please cite:
+ *
+ *  Alexandros Stamatakis:"RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models". 
+ *  Bioinformatics 2006; doi: 10.1093/bioinformatics/btl446
+ */
+
+#include "mem_alloc.h"
+
+#ifndef WIN32 
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+#include "pll.h"
+#include "pllInternal.h"
+
+#ifdef __SSE3
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#endif
+
+
+/* optimized implementation for computing per-site log likelihoods under CAT and GAMMA for DNA and protein data */
+
+#if (defined(__SSE3) || defined(__AVX))
+static __inline void computeVectorGTRCATPROT(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+					   traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+					   unsigned  char **yVector, int mxtips);
+
+static double evaluatePartialGTRCATPROT(int i, double ki, int counter,  traversalInfo *ti, double qz,
+					int w, double *EIGN, double *EI, double *EV,
+					double *tipVector, unsigned char **yVector, 
+					int branchReference, int mxtips);
+
+static __inline void computeVectorGTRGAMMAPROT(double *lVector, int *eVector, double *gammaRates, int i, double qz, double rz,
+					     traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+					     unsigned  char **yVector, int mxtips);
+
+static double evaluatePartialGTRGAMMAPROT(int i, int counter,  traversalInfo *ti, double qz,
+					  int w, double *EIGN, double *EI, double *EV,
+					  double *tipVector, unsigned char **yVector, 
+					  double *gammaRates,
+					  int branchReference, int mxtips);
+
+static __inline void computeVectorGTRCAT(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+				       traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+				       unsigned char **yVector, int mxtips);
+
+static double evaluatePartialGTRCAT(int i, double ki, int counter,  traversalInfo *ti, double qz,
+				    int w, double *EIGN, double *EI, double *EV,
+				    double *tipVector, unsigned  char **yVector, 
+				    int branchReference, int mxtips);
+
+static __inline void computeVectorGTRCAT_BINARY(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+					      traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+					      unsigned char **yVector, int mxtips);
+
+static double evaluatePartialGTRCAT_BINARY(int i, double ki, int counter,  traversalInfo *ti, double qz,
+					   int w, double *EIGN, double *EI, double *EV,
+					   double *tipVector, unsigned  char **yVector, 
+					   int branchReference, int mxtips);
+
+static double evaluatePartialGTRGAMMA(int i, int counter,  traversalInfo *ti, double qz,
+				      int w, double *EIGN, double *EI, double *EV,
+				      double *tipVector, unsigned char **yVector, 
+				      double *gammaRates,
+				      int branchReference, int mxtips);
+#endif
+
+/* the next two functions are generic, non-optimized versions of the per-site log likelihood calculations,
+   but only under the CAT model. There are no generic implementations available for GAMMA yet, since 
+   these functions were not needed in RAxML; however, optimized functions for GAMMA exist further below.
+   The only use of the CAT functions was to optimize per-site rates based on their likelihood under the CAT 
+   model of rate heterogeneity. */
+
+
+static __inline void computeVectorCAT_FLEX(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+					 traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+					 unsigned char **yVector, int mxtips, const int states)
+{      
+  /* allocate some space we need */
+ 
+  double  
+    *d1 =    (double *)rax_malloc(sizeof(double) * states), 
+    *d2 =    (double *)rax_malloc(sizeof(double) * states),  
+    *x1px2 = (double *)rax_malloc(sizeof(double) * states), 
+    ump_x1, 
+    ump_x2,    
+    lz1, 
+    lz2,
+    *x1, 
+    *x2, 
+    *x3;
+  
+  int 
+    scale,
+    j, 
+    k,
+    pNumber = ti->pNumber,
+    rNumber = ti->rNumber,
+    qNumber = ti->qNumber;
+ 
+  /* 
+     lVector holds the space for computing ancestral probabilities on a single column of the tree.
+     Hence, under CAT we index the space required to store the parent ancestral probability vector 
+     by multiplying the number of states with the offset in the array given by the inner node number.
+   */
+
+  x3  = &lVector[states * (pNumber  - mxtips)];  
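+
+  /* for illustration (hypothetical numbers): with mxtips = 10 taxa and states = 4, the inner
+     node pNumber = 12 stores its single-column vector at lVector[4 * (12 - 10)], i.e. entries 8..11 */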
+ 
+  /* do a case switch to figure out how to index the child nodes x1 and x2,
+     analogous to the standard newview implementation.
+     Note the index i that we use to address the specific tip position 
+     for which we want to compute the per-site log likelihood */
+
+  switch(ti->tipCase)
+    {
+    case PLL_TIP_TIP:     
+      x1 = &(tipVector[states * yVector[qNumber][i]]);
+      x2 = &(tipVector[states * yVector[rNumber][i]]);    
+      break;
+    case PLL_TIP_INNER:     
+      x1 = &(tipVector[states * yVector[qNumber][i]]);
+      x2 = &(lVector[states * (rNumber - mxtips)]);           
+      break;
+    case PLL_INNER_INNER:            
+      x1 = &(lVector[states * (qNumber - mxtips)]);
+      x2 = &(lVector[states * (rNumber - mxtips)]);     
+      break;
+    default:
+      assert(0);
+    }
+     
+  /* multiply the branch lengths with the evolutionary rate */
+
+  lz1 = qz * ki;  
+  lz2 = rz * ki;
+  
+
+  /* exponentiate the branch lengths using the eigenvalues */
+
+  d1[0] = x1[0];
+  d2[0] = x2[0];
+
+
+  for(j = 1; j < states; j++)
+    {
+      d1[j] = x1[j] * exp(EIGN[j] * lz1);
+      d2[j] = x2[j] * exp(EIGN[j] * lz2);	    
+    }
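+
+  /* a sketch of the underlying algebra: the factors exp(EIGN[j] * t) implement the diagonal
+     matrix exp(Lambda * t) from the eigendecomposition P(t) = V exp(Lambda * t) V^-1 of the
+     transition probability matrix, with t = branch length * rate. The leading entry is copied
+     unchanged because the first eigenvalue of Q is zero, so its exponential factor is 1. The
+     EI and EV multiplications below supply the remaining change-of-basis steps that produce x3
+     back in the original state space */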
+ 
+ 
+  /* now loop over all states */
+
+  for(j = 0; j < states; j++)
+    {         
+      ump_x1 = 0.0;
+      ump_x2 = 0.0;
+
+      for(k = 0; k < states; k++)
+	{
+	  ump_x1 += d1[k] * EI[j * states + k];
+	  ump_x2 += d2[k] * EI[j * states + k];
+	}
+      
+      x1px2[j] = ump_x1 * ump_x2;
+    }
+  
+  for(j = 0; j < states; j++)
+    x3[j] = 0.0;
+
+  /* multiply the result of looping over all states with the eigenvector matrix EV */
+
+  for(j = 0; j < states; j++)          
+    for(k = 0; k < states; k++)	
+      x3[k] +=  x1px2[j] *  EV[states * j + k];	   
+      
+  /* now determine if we need to scale the #states entries in x3 to avoid 
+     numerical underflow. */
+     
+
+  scale = 1;
+  for(j = 0; scale && (j < states); j++)
+    scale = ((x3[j] < PLL_MINLIKELIHOOD) && (x3[j] > PLL_MINUSMINLIKELIHOOD));
+  
+  /* if we need to scale, we multiply all probabilities of the site by 2^256 
+     and increment the scaling counter by 1. 
+     The counter eVector tracks the number of scaling events 
+     at site i for which we are computing the per-site log likelihood, so that 
+     we can "undo" the scaling multiplications when we compute the log likelihood of the site 
+     at the virtual root */
+  
+  if(scale)
+    {
+      for(j = 0; j < states; j++)
+	x3[j] *= PLL_TWOTOTHE256;       
+      *eVector = *eVector + 1;
+    }	              
+
+  rax_free(d1);
+  rax_free(d2);
+  rax_free(x1px2);
+       
+  return;
+}
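+
+/* Illustrative sketch, not part of the upstream code: the scaling bookkeeping above multiplies an
+   underflowing column vector by PLL_TWOTOTHE256 and counts the event in eVector; the per-site log
+   likelihood is later corrected by scale * log(PLL_MINLIKELIHOOD). The standalone example below
+   demonstrates that the correction exactly undoes the scaling, assuming (as defined in pll.h) that
+   PLL_MINLIKELIHOOD is the reciprocal of PLL_TWOTOTHE256. */
+#if 0
+#include <math.h>
+#include <stdio.h>
+
+int main(void)
+{
+  const double twotothe256    = ldexp(1.0, 256);      /* 2^256, the PLL_TWOTOTHE256 constant */
+  const double minlikelihood  = 1.0 / twotothe256;    /* the PLL_MINLIKELIHOOD constant */
+
+  double v     = 1.0e-40;   /* a per-site likelihood contribution */
+  int    scale = 0;
+
+  v *= twotothe256;         /* one scaling event, as in the if(scale) block above */
+  scale++;
+
+  /* undoing the scaling in log space recovers log(1.0e-40) */
+  printf("%.12f == %.12f\n", log(v) + scale * log(minlikelihood), log(1.0e-40));
+  return 0;
+}
+#endif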
+
+
+/* the following function computes the per-site log likelihood of a given site i at the virtual root of the tree.
+   As input it takes the index i of the site, the evolutionary rate ki (for computing Q^(rt) where r = ki), 
+   the traversal descriptor defining the full tree traversal (Felsenstein pruning algorithm), 
+   the branch length at the root qz, the weight of the site pattern w, i.e., how many identical sites have been compressed 
+   into the current site pattern, the eigenvalues etc. (EIGN, EI, EV) associated with the eigenvector/eigenvalue decomposition 
+   of the given instantaneous substitution matrix Q, the tipVector lookup table for obtaining tip probability vectors, 
+   a pointer to the raw sequence data at the tips, a branch index (to index into the correct branch length 
+   if -M is used, i.e., a per-partition branch length estimate is deployed), and finally the maximum number of tips in the comprehensive tree 
+   as well as the number of states in the current model. */
+
+#if (!defined(__SSE3) && !defined(__AVX))
+static double evaluatePartialCAT_FLEX(int i, double ki, int counter,  traversalInfo *ti, double qz,
+				      int w, double *EIGN, double *EI, double *EV,
+				      double *tipVector, unsigned  char **yVector, 
+				      int branchReference, int mxtips, const int states)
+{
+  int 
+    scale = 0, 
+    k;
+  
+  double 
+    /* lVector is a temporary buffer to store the ancestral probability vectors of 
+       a single site, thus we allocate states * mxtips space for storing probability values.
+       Essentially only (states * (mxtips - 2)) space would be required, but I was too lazy 
+       to think about whether it has to be -1 or -2 here */
+    * lVector = NULL,   
+    * d = NULL,
+    lz, 
+    term, 
+    *x1, 
+    *x2; 
+
+  
+
+  traversalInfo 
+    *trav = &ti[0];
+ 
+  rax_posix_memalign ((void **)&lVector, PLL_BYTE_ALIGNMENT, sizeof(double) * states * mxtips);
+  rax_posix_memalign ((void **)&d,       PLL_BYTE_ALIGNMENT, sizeof(double) * states);
+  /* make sure that at one end of the branch into which we have placed the virtual root 
+     there actually is a tip!*/
+
+  assert(isTip(trav->pNumber, mxtips));
+     
+  /* for the tip we already have the data, so just set the left probability vector to the 
+     corresponding address in the pre-computed tipVector[] lookup table */
+
+  x1 = &(tipVector[states *  yVector[trav->pNumber][i]]);   
+
+  /* now iterate over the traversal descriptor that contains the nodes of the tree in the order required 
+     by the Felsenstein pruning algorithm */
+
+  for(k = 1; k < counter; k++)    
+    {
+      /* obtain the branch lengths and take the logarithms */
+      
+      double 
+	qz = ti[k].qz[branchReference],
+	rz = ti[k].rz[branchReference];
+      
+      qz = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);
+      rz = (rz > PLL_ZMIN) ? log(rz) : log(PLL_ZMIN);
+
+      /* essentially invoke a newview() for one site on entry k of the traversal descriptor.
+         counter should always correspond to the number of inner nodes in the tree for which we need
+         to compute ancestral probability values */
+
+      computeVectorCAT_FLEX(lVector, &scale, ki, i, qz, rz, &ti[k], 
+			    EIGN, EI, EV, 
+			    tipVector, yVector, mxtips, states);       
+    }
+   
+  /* now the ancestral probability values for site i at the node to the right of the virtual root 
+     are available and correctly computed, such that we can set the pointer to the right vector x2
+     to the corresponding entry */
+
+  x2 = &lVector[states * (trav->qNumber - mxtips)]; 
+
+  /* a paranoid assertion */
+
+  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);  
+ 
+  /* now just compute the log likelihood score of this site */
+      
+  /* clamp the branch length at PLL_ZMIN before taking the logarithm */
+  if(qz < PLL_ZMIN) 
+    qz = PLL_ZMIN;
+  lz  = log(qz); 
+  lz *= ki;  
+  
+  d[0] = 1.0; 
+
+  for(k = 1; k < states; k++)
+    d[k] = exp (EIGN[k] * lz);
+  
+  term = 0.0;
+
+  for(k = 0; k < states; k++) 
+    term += x1[k] * x2[k] * d[k];       
+
+  /* note the "scale * log(PLL_MINLIKELIHOOD)" term here, which we use to undo/revert the scaling multiplications 
+     such that we obtain a correct log likelihood score. The integer variable scale contains the number of times 
+     we had to scale (multiply by 2^256) site i during a full tree traversal using Felsenstein's algorithm */
+
+  term = log(fabs(term)) + (scale * log(PLL_MINLIKELIHOOD));   
+
+  /* multiply by the site pattern weight (site pattern compression factor) */
+
+  term = term * w;
+
+  /* free the memory space used for likelihood computations on this site */
+
+  rax_free(lVector);  
+  rax_free(d);
+
+  return  term;
+}
+#endif
+
+/* this is the top-level function that can be called from other parts of the code.
+   As input it takes the tree data structure, the site index, the evolutionary rate ki, 
+   and the model index (partition index). It will return the 
+   log likelihood of site i. 
+   An important pre-condition is that the tree traversal descriptor must contain 
+   a full tree traversal starting at a tip!
+
+   Note that, if you want to obtain per-site log likelihoods for altered model parameters such 
+   as the Q matrix, you will have to re-invoke the eigenvalue/eigenvector decomposition prior 
+   to calling the function below.
+*/
+
+double evaluatePartialGeneric (pllInstance *tr, partitionList *pr, int i, double ki, int _model)
+{
+  double 
+    result;
+  
+  
+  int     
+    branchReference,
+
+    /* number of states of the data type in this partition */
+    states = pr->partitionData[_model]->states;
+    
+  /* SOS ATTENTION: note the different indexing used for the parallel and sequential versions ! */
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  int index = i; 
+#else
+  int index = i - pr->partitionData[_model]->lower;
+#endif
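+
+  /* for illustration (hypothetical numbers): in the sequential build a partition covering the global
+     site range 200..299 maps global site i = 250 to the partition-local index 50; the threaded/MPI
+     builds are assumed to already receive a local index, hence no offset is subtracted there */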
+  
+  /* here we figure out if all partitions are linked via the same branch length, that is,
+     if we are conducting a joint branch length estimate or a per-partition branch length estimate */
+
+  if(pr->perGeneBranchLengths && pr->numberOfPartitions>1)
+    branchReference = _model;
+  else
+    branchReference = 0;
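+
+  /* for illustration (hypothetical numbers): with per-partition branch lengths (-M) and, say,
+     partition index _model = 2, the traversal entries are read from ti[k].qz[2] / ti[k].rz[2];
+     with linked branch lengths every partition reads slot 0 */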
+
+  /* for the generic function implementation we only offer the CAT implementation for computing/optimizing per-site evolutionary rates */
+
+#if (!defined(__SSE3) && !defined(__AVX))
+  if(tr->rateHetModel == PLL_CAT)
+    result = evaluatePartialCAT_FLEX(index, ki, tr->td[0].count, tr->td[0].ti, tr->td[0].ti[0].qz[branchReference], 
+				     pr->partitionData[_model]->wgt[index],
+				     pr->partitionData[_model]->EIGN,
+				     pr->partitionData[_model]->EI,
+				     pr->partitionData[_model]->EV,
+				     pr->partitionData[_model]->tipVector,
+				     pr->partitionData[_model]->yVector, branchReference, tr->mxtips, states);
+  else
+    /* 
+       the per-site likelihood function should only be called for the CAT model.
+       Under the GAMMA model this is required only for estimating per-site protein models, 
+       a feature which has been removed in this version of the code.
+    */
+    assert(0); 
+  
+ 
+#else
+  /* switch over the number of states of the data in the current model/partition */
+  switch(states)
+    {
+    case 2:   /* BINARY */
+      assert(!tr->saveMemory);
+      assert(tr->rateHetModel == PLL_CAT);
+
+      result = evaluatePartialGTRCAT_BINARY(index, ki, tr->td[0].count, tr->td[0].ti, 
+                                            tr->td[0].ti[0].qz[branchReference],
+                                            pr->partitionData[_model]->wgt[index],
+                                            pr->partitionData[_model]->EIGN,
+                                            pr->partitionData[_model]->EI,
+                                            pr->partitionData[_model]->EV,
+                                            pr->partitionData[_model]->tipVector,
+                                            pr->partitionData[_model]->yVector, 
+                                            branchReference, 
+                                            tr->mxtips);
+      break;
+      
+    case 4:   /* DNA */
+      /* switch over CAT versus GAMMA and pass all model parameters for the respective partition to the respective functions */
+      if(tr->rateHetModel == PLL_CAT)      
+	result = evaluatePartialGTRCAT(index, ki, tr->td[0].count, tr->td[0].ti, tr->td[0].ti[0].qz[branchReference], 
+				       pr->partitionData[_model]->wgt[index],
+				       pr->partitionData[_model]->EIGN,
+				       pr->partitionData[_model]->EI,
+				       pr->partitionData[_model]->EV,
+				       pr->partitionData[_model]->tipVector,
+				       pr->partitionData[_model]->yVector, branchReference, tr->mxtips);
+      else	
+	result = evaluatePartialGTRGAMMA(index, tr->td[0].count, tr->td[0].ti, tr->td[0].ti[0].qz[branchReference], 
+					 pr->partitionData[_model]->wgt[index],
+					 pr->partitionData[_model]->EIGN,
+					 pr->partitionData[_model]->EI,
+					 pr->partitionData[_model]->EV,
+					 pr->partitionData[_model]->tipVector,
+					 pr->partitionData[_model]->yVector,
+					 pr->partitionData[_model]->gammaRates,
+					 branchReference, tr->mxtips);	
+	
+      break;
+    case 20: /* proteins */     
+      if(tr->rateHetModel == PLL_CAT)
+	result = evaluatePartialGTRCATPROT(index, ki, tr->td[0].count, tr->td[0].ti, tr->td[0].ti[0].qz[branchReference], 
+					   pr->partitionData[_model]->wgt[index],
+					   pr->partitionData[_model]->EIGN,
+					   pr->partitionData[_model]->EI,
+					   pr->partitionData[_model]->EV,
+					   pr->partitionData[_model]->tipVector,
+					   pr->partitionData[_model]->yVector, branchReference, tr->mxtips);
+      else
+	result =  evaluatePartialGTRGAMMAPROT(index, tr->td[0].count, tr->td[0].ti, tr->td[0].ti[0].qz[branchReference], 
+					      pr->partitionData[_model]->wgt[index],
+					      pr->partitionData[_model]->EIGN,
+					      pr->partitionData[_model]->EI,
+					      pr->partitionData[_model]->EV,
+					      pr->partitionData[_model]->tipVector,
+					      pr->partitionData[_model]->yVector,
+					      pr->partitionData[_model]->gammaRates,
+					      branchReference, tr->mxtips);
+      break;   
+    default:
+      assert(0);
+    }
+#endif
+ 
+
+  return result;
+}
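+
+/* Hypothetical caller sketch, not part of the library: accumulating the (pattern-weighted) per-site
+   log likelihoods of one partition with evaluatePartialGeneric(). It assumes the traversal descriptor
+   tr->td[0] already holds a full traversal starting at a tip (the pre-condition stated above), that
+   the eigen decomposition is up to date, that the partition's site range is delimited by the lower
+   field used above and an upper field assumed to exist in partitionData, and that ki = 1.0 disables
+   per-site rate scaling. */
+#if 0
+static double partitionLogLikelihood(pllInstance *tr, partitionList *pr, int model)
+{
+  double logl = 0.0;
+  int    site;
+
+  for(site = pr->partitionData[model]->lower; site < pr->partitionData[model]->upper; site++)
+    logl += evaluatePartialGeneric(tr, pr, site, 1.0, model);
+
+  return logl;
+}
+#endif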
+
+#if (defined(__SSE3) || defined(__AVX))
+/* optimized function implementations for computing per-site log likelihoods under CAT and GAMMA for protein and 
+   DNA data. 
+   The structure is analogous to the generic implementations above, with some data- and model-specific optimizations and vectorizations.
+*/
+
+static __inline void computeVectorGTRCAT_BINARY(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+					      traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+					      unsigned char **yVector, int mxtips)
+{       
+  double  d1, d2,  ump_x1, ump_x2, x1px2[2], lz1, lz2; 
+  double *x1, *x2, *x3;
+  int 
+    j, k,
+    pNumber = ti->pNumber,
+    rNumber = ti->rNumber,
+    qNumber = ti->qNumber;
+ 
+  x3  = &lVector[2 * (pNumber  - mxtips)];  
+
+  switch(ti->tipCase)
+    {
+    case PLL_TIP_TIP:     
+      x1 = &(tipVector[2 * yVector[qNumber][i]]);
+      x2 = &(tipVector[2 * yVector[rNumber][i]]);   
+      break;
+    case PLL_TIP_INNER:     
+      x1 = &(tipVector[2 * yVector[qNumber][i]]);
+      x2 = &lVector[2 * (rNumber - mxtips)];                    
+      break;
+    case PLL_INNER_INNER:            
+      x1 = &lVector[2 * (qNumber - mxtips)];
+      x2 = &lVector[2 * (rNumber - mxtips)];               
+      break;
+    default:
+      assert(0);
+    }
+     
+  lz1 = qz * ki;  
+  lz2 = rz * ki;
+  
+ 
+  d1 = x1[1] * exp(EIGN[1] * lz1);
+  d2 = x2[1] * exp(EIGN[1] * lz2);	        
+ 
+  for(j = 0; j < 2; j++)
+    {     
+      ump_x1 = x1[0];
+      ump_x2 = x2[0];
+      
+      ump_x1 += d1 * EI[j * 2 + 1];
+      ump_x2 += d2 * EI[j * 2 + 1];
+	
+      x1px2[j] = ump_x1 * ump_x2;
+    }
+  
+  for(j = 0; j < 2; j++)
+    x3[j] = 0.0;
+
+  for(j = 0; j < 2; j++)          
+    for(k = 0; k < 2; k++)	
+      x3[k] +=  x1px2[j] *  EV[2 * j + k];	   
+      
+  
+  if (x3[0] < PLL_MINLIKELIHOOD && x3[0] > PLL_MINUSMINLIKELIHOOD &&
+      x3[1] < PLL_MINLIKELIHOOD && x3[1] > PLL_MINUSMINLIKELIHOOD 
+      )
+    {	     
+      x3[0]   *= PLL_TWOTOTHE256;
+      x3[1]   *= PLL_TWOTOTHE256;     
+      *eVector = *eVector + 1;
+    }	              
+
+  return;
+}
+
+static double evaluatePartialGTRCAT_BINARY(int i, double ki, int counter,  traversalInfo *ti, double qz,
+					   int w, double *EIGN, double *EI, double *EV,
+					   double *tipVector, unsigned  char **yVector, 
+					   int branchReference, int mxtips)
+{
+  double lz, term;       
+  double  d;
+  double   *x1, *x2; 
+  int scale = 0, k;
+  double *lVector = (double *)malloc(sizeof(double) * 2 * mxtips);  
+  traversalInfo *trav = &ti[0];
+ 
+  assert(isTip(trav->pNumber, mxtips));
+     
+  x1 = &(tipVector[2 *  yVector[trav->pNumber][i]]);   
+
+  for(k = 1; k < counter; k++)  
+    {
+      double 
+	qz = ti[k].qz[branchReference],
+	rz = ti[k].rz[branchReference];
+      
+      qz = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);
+      rz = (rz > PLL_ZMIN) ? log(rz) : log(PLL_ZMIN);
+
+      computeVectorGTRCAT_BINARY(lVector, &scale, ki, i, qz, rz, &ti[k], 
+				 EIGN, EI, EV, 
+				 tipVector, yVector, mxtips);       
+    }
+   
+  x2 = &lVector[2 * (trav->qNumber - mxtips)];
+     
+  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);  
+       
+  if(qz < PLL_ZMIN) 
+    qz = PLL_ZMIN;
+  lz  = log(qz); 
+  lz *= ki;  
+  
+  d = exp(EIGN[1] * lz);
+  
+  term =  x1[0] * x2[0];
+  term += x1[1] * x2[1] * d; 
+
+  term = log(fabs(term)) + (scale * log(PLL_MINLIKELIHOOD));   
+
+  term = term * w;
+
+  free(lVector);
+  
+  return  term;
+}
+
+
+
+static __inline void computeVectorGTRGAMMAPROT(double *lVector, int *eVector, double *gammaRates, int i, double qz, double rz,
+					     traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+					     unsigned  char **yVector, int mxtips)
+{       
+  double   
+    *x1, 
+    *x2, 
+    *x3;  
+  
+  int
+    s,
+    pNumber = ti->pNumber,
+    rNumber = ti->rNumber,
+    qNumber = ti->qNumber,
+    index1[4],
+    index2[4];
+  
+ 
+  x3  = &(lVector[80 * (pNumber  - mxtips)]);     
+
+  switch(ti->tipCase)
+    {
+    case PLL_TIP_TIP:    
+      x1 = &(tipVector[20 * yVector[qNumber][i]]);
+      x2 = &(tipVector[20 * yVector[rNumber][i]]);     
+      for(s = 0; s < 4; s++)
+	{
+	  index1[s] = 0;
+	  index2[s] = 0;
+	}
+      break;
+    case PLL_TIP_INNER:     
+      x1 = &(tipVector[20 * yVector[qNumber][i]]);
+      x2 = &(  lVector[80 * (rNumber - mxtips)]);   
+      for(s = 0; s < 4; s++)       
+	index1[s] = 0;
+      for(s = 0; s < 4; s++)     
+	index2[s] = s;                     
+      break;
+    case PLL_INNER_INNER:            
+      x1 = &(lVector[80 * (qNumber - mxtips)]);
+      x2 = &(lVector[80 * (rNumber - mxtips)]); 
+      for(s = 0; s < 4; s++)
+	{
+	  index1[s] = s;
+	  index2[s] = s;
+	}                
+      break;    
+    default:
+      assert(0);
+    }
+     
+  {
+	  PLL_ALIGN_BEGIN double
+		  e1[20] PLL_ALIGN_END,
+		  e2[20] PLL_ALIGN_END,
+		  d1[20] PLL_ALIGN_END,
+		  d2[20] PLL_ALIGN_END;
+    double  
+      lz1, lz2;  
+    int 
+      l, 
+      k, 
+      scale, 
+      j;
+     
+    for(j = 0; j < 4; j++)
+      {
+	lz1 = qz * gammaRates[j];            
+	lz2 = rz * gammaRates[j];        
+
+	e1[0] = 1.0;
+	e2[0] = 1.0;
+    
+	for(l = 1; l < 20; l++)
+	  {
+	    e1[l] = exp(EIGN[l] * lz1);
+	    e2[l] = exp(EIGN[l] * lz2);
+	  }
+
+	for(l = 0; l < 20; l+=2)
+	  {
+	    __m128d d1v = _mm_mul_pd(_mm_load_pd(&x1[20 * index1[j] + l]), _mm_load_pd(&e1[l]));
+	    __m128d d2v = _mm_mul_pd(_mm_load_pd(&x2[20 * index2[j] + l]), _mm_load_pd(&e2[l]));
+	    
+	    _mm_store_pd(&d1[l], d1v);
+	    _mm_store_pd(&d2[l], d2v);	
+	  }
+
+	__m128d zero = _mm_setzero_pd();
+
+	for(l = 0; l < 20; l+=2)
+	  _mm_store_pd(&x3[j * 20 + l], zero);
+                
+	for(l = 0; l < 20; l++)
+	  { 	      
+	    double *ev = &EV[l * 20];
+	    __m128d ump_x1v = _mm_setzero_pd();
+	    __m128d ump_x2v = _mm_setzero_pd();
+	    __m128d x1px2v;
+	    
+	    for(k = 0; k < 20; k+=2)
+	      {       
+		__m128d eiv = _mm_load_pd(&EI[20 * l + k]);
+		__m128d d1v = _mm_load_pd(&d1[k]);
+		__m128d d2v = _mm_load_pd(&d2[k]);
+		ump_x1v = _mm_add_pd(ump_x1v, _mm_mul_pd(d1v, eiv));
+		ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(d2v, eiv));	  
+	      }
+
+	    ump_x1v = _mm_hadd_pd(ump_x1v, ump_x1v);
+	    ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+
+	    x1px2v = _mm_mul_pd(ump_x1v, ump_x2v);
+
+	    for(k = 0; k < 20; k+=2)
+	      {
+		__m128d ex3v = _mm_load_pd(&x3[j * 20 + k]);
+		__m128d EVV  = _mm_load_pd(&ev[k]);
+		ex3v = _mm_add_pd(ex3v, _mm_mul_pd(x1px2v, EVV));
+		
+		_mm_store_pd(&x3[j * 20 + k], ex3v);	   	   
+	      }
+	  }        
+      }
+    
+    scale = 1;
+    for(l = 0; scale && (l < 80); l++)
+      scale = ((x3[l] < PLL_MINLIKELIHOOD) && (x3[l] > PLL_MINUSMINLIKELIHOOD));	       	      	      	       	       
+    
+    if(scale)
+      {	      
+	__m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+	for(l = 0; l < 80; l+=2)
+	  {
+	    __m128d ex3v = _mm_mul_pd(_mm_load_pd(&x3[l]),twoto);
+	    _mm_store_pd(&x3[l], ex3v);	
+	  }
+
+	*eVector = *eVector + 1;
+      }
+    
+    return;      
+  }
+}
+
+static  void computeVectorGTRGAMMA(double *lVector, int *eVector, double *gammaRates, int i, double qz, double rz,
+					 traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+					 unsigned  char **yVector, int mxtips)
+{       
+  double   
+    *x1, 
+    *x2, 
+    *x3;   
+
+  int
+    s,
+    pNumber = ti->pNumber,
+    rNumber = ti->rNumber,
+    qNumber = ti->qNumber,
+    index1[4],
+    index2[4];
+  
+ 
+  x3  = &(lVector[16 * (pNumber  - mxtips)]);     
+
+  switch(ti->tipCase)
+    {
+    case PLL_TIP_TIP:          
+      x1 = &(tipVector[4 * yVector[qNumber][i]]);
+      x2 = &(tipVector[4 * yVector[rNumber][i]]);     
+      
+      for(s = 0; s < 4; s++)
+	{
+	  index1[s] = 0;
+	  index2[s] = 0;
+	}
+      break;
+    case PLL_TIP_INNER:     
+      x1 = &(tipVector[4 * yVector[qNumber][i]]);
+      x2 = &(lVector[16 * (rNumber - mxtips)]);   
+      for(s = 0; s < 4; s++)       
+	{
+	  index1[s] = 0;      
+	  index2[s] = s;  
+	}
+      break;
+    case PLL_INNER_INNER:            
+      x1 = &(lVector[16 * (qNumber - mxtips)]);
+      x2 = &(lVector[16 * (rNumber - mxtips)]);       
+      for(s = 0; s < 4; s++)
+	{
+	  index1[s] = s;
+	  index2[s] = s;
+	}                
+      break;    
+    default:
+      assert(0);
+    }
+     
+  {
+	  PLL_ALIGN_BEGIN double
+		  e1[20] PLL_ALIGN_END,
+		  e2[20] PLL_ALIGN_END,
+		  d1[20] PLL_ALIGN_END,
+		  d2[20] PLL_ALIGN_END;
+    double  
+      lz1, lz2;  
+    
+    int 
+      l, 
+      k, 
+      scale, 
+      j;
+     
+    for(j = 0; j < 4; j++)
+      {
+	lz1 = qz * gammaRates[j];            
+	lz2 = rz * gammaRates[j];        
+
+	e1[0] = 1.0;
+	e2[0] = 1.0;
+    
+	for(l = 1; l < 4; l++)
+	  {
+	    e1[l] = exp(EIGN[l] * lz1);
+	    e2[l] = exp(EIGN[l] * lz2);
+	  }
+
+	for(l = 0; l < 4; l+=2)
+	  {
+	    __m128d d1v = _mm_mul_pd(_mm_load_pd(&x1[4 * index1[j] + l]), _mm_load_pd(&e1[l]));
+	    __m128d d2v = _mm_mul_pd(_mm_load_pd(&x2[4 * index2[j] + l]), _mm_load_pd(&e2[l]));
+	    
+	    _mm_store_pd(&d1[l], d1v);
+	    _mm_store_pd(&d2[l], d2v);	
+	  }
+
+	__m128d zero = _mm_setzero_pd();
+
+	for(l = 0; l < 4; l+=2)
+	  _mm_store_pd(&x3[j * 4 + l], zero);
+                
+	for(l = 0; l < 4; l++)
+	  { 	      
+	    double *ev = &EV[l * 4];
+	    __m128d ump_x1v = _mm_setzero_pd();
+	    __m128d ump_x2v = _mm_setzero_pd();
+	    __m128d x1px2v;
+	    
+	    for(k = 0; k < 4; k+=2)
+	      {       
+		__m128d eiv = _mm_load_pd(&EI[4 * l + k]);
+		__m128d d1v = _mm_load_pd(&d1[k]);
+		__m128d d2v = _mm_load_pd(&d2[k]);
+		ump_x1v = _mm_add_pd(ump_x1v, _mm_mul_pd(d1v, eiv));
+		ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(d2v, eiv));	  
+	      }
+
+	    ump_x1v = _mm_hadd_pd(ump_x1v, ump_x1v);
+	    ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+
+	    x1px2v = _mm_mul_pd(ump_x1v, ump_x2v);
+
+	    for(k = 0; k < 4; k+=2)
+	      {
+		__m128d ex3v = _mm_load_pd(&x3[j * 4 + k]);
+		__m128d EVV  = _mm_load_pd(&ev[k]);
+		ex3v = _mm_add_pd(ex3v, _mm_mul_pd(x1px2v, EVV));
+		
+		_mm_store_pd(&x3[j * 4 + k], ex3v);	   	   
+	      }
+	  }        
+      }
+    
+  
+    scale = 1;
+    for(l = 0; scale && (l < 16); l++)
+      scale = (PLL_ABS(x3[l]) < PLL_MINLIKELIHOOD);	       	      	      	       	       
+    
+    if(scale)
+      {	      
+	__m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+	
+	for(l = 0; l < 16; l+=2)
+	  {
+	    __m128d ex3v = _mm_mul_pd(_mm_load_pd(&x3[l]),twoto);
+	    _mm_store_pd(&x3[l], ex3v);	
+	  }
+	
+	*eVector = *eVector + 1;
+      }  
+    
+    return;      
+  }
+}
+
+
+static double evaluatePartialGTRGAMMAPROT(int i, int counter,  traversalInfo *ti, double qz,
+					  int w, double *EIGN, double *EI, double *EV,
+					  double *tipVector, unsigned char **yVector, 
+					  double *gammaRates,
+					  int branchReference, int mxtips)
+{
+  double lz, term;       
+  double  d[80];
+  double   *x1, *x2; 
+  int scale = 0, k, l, j;
+
+  double 
+	  *lVector = NULL;
+  PLL_ALIGN_BEGIN double
+	  myEI[400]  PLL_ALIGN_END;
+
+  traversalInfo 
+    *trav = &ti[0];
+
+  rax_posix_memalign ((void **)&lVector, PLL_BYTE_ALIGNMENT, sizeof(double) * 80 * mxtips);
+
+  for(k = 0; k < 20; k++)
+    {         
+      for(l = 0; l < 20; l++)
+	myEI[k * 20 + l] = EI[k * 20 + l];
+    }
+
+  assert(isTip(trav->pNumber, mxtips));
+     
+  x1 = &(tipVector[20 *  yVector[trav->pNumber][i]]);   
+
+  for(k = 1; k < counter; k++)                
+    {
+      double 
+	qz = ti[k].qz[branchReference],
+	rz = ti[k].rz[branchReference];
+      
+      qz = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);
+      rz = (rz > PLL_ZMIN) ? log(rz) : log(PLL_ZMIN);
+
+      computeVectorGTRGAMMAPROT(lVector, &scale, gammaRates, i, qz, rz, 
+				&ti[k], EIGN, myEI, EV, 
+				tipVector, yVector, mxtips);
+    }
+   
+  x2 = &lVector[80 * (trav->qNumber - mxtips)];       
+
+  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);  
+  
+  if(qz < PLL_ZMIN) 
+    qz = PLL_ZMIN;
+  lz  = log(qz);
+  
+  
+  
+  for(j = 0; j < 4; j++)
+    {
+      d[20 * j] = 1.0;
+      for(l = 1; l < 20; l++)
+	d[20 * j + l] = exp(EIGN[l] * lz * gammaRates[j]);
+    }
+
+ 
+  for(j = 0, term = 0.0; j < 4; j++)
+    {
+      for(l = 0; l < 20; l++)
+	term += x1[l] * x2[20 * j + l] * d[j * 20 + l];	      
+    }
+  
+  term = log(0.25 * fabs(term)) + (scale * log(PLL_MINLIKELIHOOD));   
+
+  term = term * w;
+
+ rax_free(lVector);
+  
+ 
+  return  term;
+}
+
+static double evaluatePartialGTRGAMMA(int i, int counter,  traversalInfo *ti, double qz,
+				      int w, double *EIGN, double *EI, double *EV,
+				      double *tipVector, unsigned char **yVector, 
+				      double *gammaRates,
+				      int branchReference, int mxtips)
+{
+  double lz, term;       
+  double  d[16];
+  double   *x1, *x2; 
+  int scale = 0, k, l, j;
+  double 
+	  *lVector = NULL;
+  PLL_ALIGN_BEGIN double
+	  myEI[16]  PLL_ALIGN_END;
+
+
+  traversalInfo 
+    *trav = &ti[0];
+
+  rax_posix_memalign ((void **)&lVector, PLL_BYTE_ALIGNMENT, sizeof(double) * 16 * mxtips);
+
+  for(k = 0; k < 4; k++)
+    {           
+      for(l = 0; l < 4; l++)
+	myEI[k * 4 + l] = EI[k * 4 + l];
+    }
+
+  assert(isTip(trav->pNumber, mxtips));
+     
+  x1 = &(tipVector[4 *  yVector[trav->pNumber][i]]);   
+
+  for(k = 1; k < counter; k++)                
+    {
+      double 
+	qz = ti[k].qz[branchReference],
+	rz = ti[k].rz[branchReference];
+      
+      qz = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);
+      rz = (rz > PLL_ZMIN) ? log(rz) : log(PLL_ZMIN);
+
+      computeVectorGTRGAMMA(lVector, &scale, gammaRates, i, qz, rz, 
+				&ti[k], EIGN, myEI, EV, 
+				tipVector, yVector, mxtips);
+    }
+   
+  x2 = &lVector[16 * (trav->qNumber - mxtips)];       
+
+  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);  
+  
+  if(qz < PLL_ZMIN) 
+    qz = PLL_ZMIN;
+  lz  = log(qz); 
+  
+  for(j = 0; j < 4; j++)
+    {
+      d[4 * j] = 1.0;
+      for(l = 1; l < 4; l++)
+	d[4 * j + l] = exp(EIGN[l] * lz * gammaRates[j]);
+    }
+
+ 
+  for(j = 0, term = 0.0; j < 4; j++)
+    {
+      for(l = 0; l < 4; l++)
+	term += x1[l] * x2[4 * j + l] * d[j * 4 + l];	      
+    }
+
+  term = log(0.25 * fabs(term)) + (scale * log(PLL_MINLIKELIHOOD));   
+
+  term = term * w;
+
+  rax_free(lVector);
+  
+  
+  return  term;
+}
+
+
+
+
+static __inline void computeVectorGTRCAT(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+				       traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+				       unsigned char **yVector, int mxtips)
+{       
+  double  d1[3], d2[3],  ump_x1, ump_x2, x1px2[4], lz1, lz2; 
+  double *x1, *x2, *x3;
+  int j, k,
+    pNumber = ti->pNumber,
+    rNumber = ti->rNumber,
+    qNumber = ti->qNumber;
+ 
+  x3  = &lVector[4 * (pNumber  - mxtips)];  
+ 
+
+  switch(ti->tipCase)
+    {
+    case PLL_TIP_TIP:     
+      x1 = &(tipVector[4 * yVector[qNumber][i]]);
+      x2 = &(tipVector[4 * yVector[rNumber][i]]);    
+      break;
+    case PLL_TIP_INNER:     
+      x1 = &(tipVector[4 * yVector[qNumber][i]]);
+      x2 = &lVector[4 * (rNumber - mxtips)];           
+      break;
+    case PLL_INNER_INNER:            
+      x1 = &lVector[4 * (qNumber - mxtips)];
+      x2 = &lVector[4 * (rNumber - mxtips)];     
+      break;
+    default:
+      assert(0);
+    }
+     
+  lz1 = qz * ki;  
+  lz2 = rz * ki;
+  
+  for(j = 0; j < 3; j++)
+    {
+      d1[j] = 
+	x1[j + 1] * 
+	exp(EIGN[j + 1] * lz1);
+      d2[j] = x2[j + 1] * exp(EIGN[j + 1] * lz2);	    
+    }
+ 
+ 
+  for(j = 0; j < 4; j++)
+    {     
+      ump_x1 = x1[0];
+      ump_x2 = x2[0];
+      for(k = 0; k < 3; k++)
+	{
+	  ump_x1 += d1[k] * EI[j * 4 + k + 1];
+	  ump_x2 += d2[k] * EI[j * 4 + k + 1];
+	}
+      x1px2[j] = ump_x1 * ump_x2;
+    }
+  
+  for(j = 0; j < 4; j++)
+    x3[j] = 0.0;
+
+  for(j = 0; j < 4; j++)          
+    for(k = 0; k < 4; k++)	
+      x3[k] +=  x1px2[j] *  EV[4 * j + k];	   
+      
+  
+  if (x3[0] < PLL_MINLIKELIHOOD && x3[0] > PLL_MINUSMINLIKELIHOOD &&
+      x3[1] < PLL_MINLIKELIHOOD && x3[1] > PLL_MINUSMINLIKELIHOOD &&
+      x3[2] < PLL_MINLIKELIHOOD && x3[2] > PLL_MINUSMINLIKELIHOOD &&
+      x3[3] < PLL_MINLIKELIHOOD && x3[3] > PLL_MINUSMINLIKELIHOOD)
+    {	     
+      x3[0]   *= PLL_TWOTOTHE256;
+      x3[1]   *= PLL_TWOTOTHE256;
+      x3[2]   *= PLL_TWOTOTHE256;     
+      x3[3]   *= PLL_TWOTOTHE256;     
+      *eVector = *eVector + 1;
+    }	              
+
+  return;
+}
+
+
+
+
+
+
+
+
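+/* Per-site log-likelihood contribution of site i (weight w) under GTR/CAT:
+   the ancestral vectors along the traversal in ti[] are recomputed with
+   computeVectorGTRCAT, then the tip vector at trav->pNumber is combined with
+   the inner vector at trav->qNumber across the branch qz, and the scaling
+   correction scale * log(PLL_MINLIKELIHOOD) is added back. */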
+static double evaluatePartialGTRCAT(int i, double ki, int counter,  traversalInfo *ti, double qz,
+				    int w, double *EIGN, double *EI, double *EV,
+				    double *tipVector, unsigned  char **yVector, 
+				    int branchReference, int mxtips)
+{
+  double lz, term;       
+  double  d[3];
+  double   *x1, *x2, *lVector = NULL; 
+  int scale = 0, k;
+  traversalInfo *trav = &ti[0];
+ 
+  rax_posix_memalign ((void **) &lVector, PLL_BYTE_ALIGNMENT, sizeof(double) * 4 * mxtips);    
+
+  assert(isTip(trav->pNumber, mxtips));
+     
+  x1 = &(tipVector[4 *  yVector[trav->pNumber][i]]);   
+
+  for(k = 1; k < counter; k++)    
+    {
+      double 
+	qz = ti[k].qz[branchReference],
+	rz = ti[k].rz[branchReference];
+      
+      qz = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);
+      rz = (rz > PLL_ZMIN) ? log(rz) : log(PLL_ZMIN);
+
+      computeVectorGTRCAT(lVector, &scale, ki, i, qz, rz, &ti[k], 
+			  EIGN, EI, EV, 
+			  tipVector, yVector, mxtips);       
+    }
+   
+  x2 = &lVector[4 * (trav->qNumber - mxtips)]; 
+
+  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);  
+       
+  if(qz < PLL_ZMIN) 
+    qz = PLL_ZMIN;
+  lz = log(qz); 
+  lz *= ki;  
+  
+  d[0] = exp (EIGN[1] * lz);
+  d[1] = exp (EIGN[2] * lz);
+  d[2] = exp (EIGN[3] * lz);       	   
+  
+  term =  x1[0] * x2[0];
+  term += x1[1] * x2[1] * d[0];
+  term += x1[2] * x2[2] * d[1];
+  term += x1[3] * x2[3] * d[2];     
+
+  term = log(fabs(term)) + (scale * log(PLL_MINLIKELIHOOD));   
+
+  term = term * w;
+
+  rax_free(lVector);  
+
+  return  term;
+}
+
+/**********************************************************************************/
+
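+/* Protein (20-state) analogue of computeVectorGTRCAT: conditional likelihood
+   vector at inner node pNumber for site i under a single per-site rate ki,
+   with the same PLL_TWOTOTHE256 rescaling scheme. */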
+static __inline void computeVectorGTRCATPROT(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+				       traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+				       unsigned char **yVector, int mxtips)
+{       
+  double  d1[20], d2[20],  ump_x1, ump_x2, x1px2[20], lz1, lz2; 
+  double *x1, *x2, *x3;
+  int j, k,
+    scale = 1,
+    pNumber = ti->pNumber,
+    rNumber = ti->rNumber,
+    qNumber = ti->qNumber;
+ 
+  x3  = &lVector[20 * (pNumber  - mxtips)];  
+ 
+
+  switch(ti->tipCase)
+    {
+    case PLL_TIP_TIP:     
+      x1 = &(tipVector[20 * yVector[qNumber][i]]);
+      x2 = &(tipVector[20 * yVector[rNumber][i]]);    
+      break;
+    case PLL_TIP_INNER:     
+      x1 = &(tipVector[20 * yVector[qNumber][i]]);
+      x2 = &lVector[20 * (rNumber - mxtips)];           
+      break;
+    case PLL_INNER_INNER:            
+      x1 = &lVector[20 * (qNumber - mxtips)];
+      x2 = &lVector[20 * (rNumber - mxtips)];     
+      break;
+    default:
+      assert(0);
+    }
+     
+  lz1 = qz * ki;  
+  lz2 = rz * ki;
+  
+  d1[0] = x1[0];
+  d2[0] = x2[0];
+
+  for(j = 1; j < 20; j++)
+    {
+      d1[j] = x1[j] * exp(EIGN[j] * lz1);
+      d2[j] = x2[j] * exp(EIGN[j] * lz2);	    
+    }
+ 
+ 
+  for(j = 0; j < 20; j++)
+    {        
+      ump_x1 = 0;
+      ump_x2 = 0;
+
+      for(k = 0; k < 20; k++)
+	{
+	  ump_x1 += d1[k] * EI[j * 20 + k];
+	  ump_x2 += d2[k] * EI[j * 20 + k];
+	}
+      
+      x1px2[j] = ump_x1 * ump_x2;
+    }
+  
+  for(j = 0; j < 20; j++)
+    x3[j] = 0.0;
+
+  for(j = 0; j < 20; j++)          
+    for(k = 0; k < 20; k++)	
+      x3[k] +=  x1px2[j] *  EV[20 * j + k];	   
+      
+  scale = 1;
+  for(k = 0; (k < 20) && scale; k++)    
+    scale = ((x3[k] < PLL_MINLIKELIHOOD) && (x3[k] > PLL_MINUSMINLIKELIHOOD));    
+
+  if(scale)
+    {	        
+
+      for(k = 0; k < 20; k++)
+	x3[k]   *= PLL_TWOTOTHE256;
+         
+      *eVector = *eVector + 1;
+    }	              
+
+  return;
+}
+
+
+
+
+
+
+
+
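+/* Protein (20-state) analogue of evaluatePartialGTRCAT: recomputes the
+   ancestral vectors along the traversal and returns the weighted per-site
+   log-likelihood across the branch qz. */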
+static double evaluatePartialGTRCATPROT(int i, double ki, int counter,  traversalInfo *ti, double qz,
+				    int w, double *EIGN, double *EI, double *EV,
+				    double *tipVector, unsigned  char **yVector, 
+				    int branchReference, int mxtips)
+{
+  double lz, term;       
+  double  d[20];
+  double   *x1, *x2, *lVector = NULL; 
+  int scale = 0, k;
+
+  traversalInfo *trav = &ti[0];
+ 
+  rax_posix_memalign ((void **)&lVector, PLL_BYTE_ALIGNMENT, sizeof(double) * 20 * mxtips);
+
+  assert(isTip(trav->pNumber, mxtips));
+     
+  x1 = &(tipVector[20 *  yVector[trav->pNumber][i]]);   
+
+  for(k = 1; k < counter; k++)    
+    {
+      double 
+	qz = ti[k].qz[branchReference],
+	rz = ti[k].rz[branchReference];
+      
+      qz = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);
+      rz = (rz > PLL_ZMIN) ? log(rz) : log(PLL_ZMIN);
+
+      computeVectorGTRCATPROT(lVector, &scale, ki, i, qz, rz, &ti[k], 
+			  EIGN, EI, EV, 
+			  tipVector, yVector, mxtips);       
+    }
+   
+  x2 = &lVector[20 * (trav->qNumber - mxtips)]; 
+
+  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);  
+       
+  if(qz < PLL_ZMIN) 
+    qz = PLL_ZMIN;
+  lz = log(qz); 
+  lz *= ki;  
+  
+  d[0] = 1.0;
+  
+  for(k = 1; k < 20; k++)
+    d[k] =  exp (EIGN[k] * lz);
+
+        	   
+  term =  0.0;
+  for(k = 0; k < 20; k++)
+    term += x1[k] * x2[k] * d[k];     
+
+  term = log(fabs(term)) + (scale * log(PLL_MINLIKELIHOOD));   
+
+  term = term * w;
+
+  rax_free(lVector);  
+
+  return  term;
+}
+
+/******************************************/
+
+
+
+#endif
diff --git a/pll/fastDNAparsimony.c b/pll/fastDNAparsimony.c
new file mode 100644
index 0000000..1076465
--- /dev/null
+++ b/pll/fastDNAparsimony.c
@@ -0,0 +1,1942 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file fastDNAparsimony.c
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>  
+#endif
+
+#include <limits.h>
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+
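+/* Vector abstraction for the bit-parallel parsimony kernels below: one SIMD
+   register (INT_TYPE) holds INTS_PER_VECTOR 32-bit parsimony words, and the
+   VECTOR_* macros map the required bitwise operations onto the matching
+   MIC, AVX or SSE3 intrinsics. */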
+#if defined(__MIC_NATIVE)
+
+#include <immintrin.h>
+
+#define INTS_PER_VECTOR 16
+//#define LONG_INTS_PER_VECTOR 8
+#define LONG_INTS_PER_VECTOR (64/sizeof(long))
+#define INT_TYPE __m512i
+#define CAST double*
+#define SET_ALL_BITS_ONE _mm512_set1_epi32(0xFFFFFFFF)
+#define SET_ALL_BITS_ZERO _mm512_setzero_epi32()
+#define VECTOR_LOAD _mm512_load_epi32
+#define VECTOR_STORE  _mm512_store_epi32
+#define VECTOR_BIT_AND _mm512_and_epi32
+#define VECTOR_BIT_OR  _mm512_or_epi32
+#define VECTOR_AND_NOT _mm512_andnot_epi32
+
+#elif defined(__AVX)
+
+#include <xmmintrin.h>
+#include <immintrin.h>
+#include <pmmintrin.h>
+
+#define INTS_PER_VECTOR 8
+//#define LONG_INTS_PER_VECTOR 4
+#define LONG_INTS_PER_VECTOR (32/sizeof(long))
+#define INT_TYPE __m256d
+#define CAST double*
+//#define SET_ALL_BITS_ONE (__m256d)_mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
+//#define SET_ALL_BITS_ZERO (__m256d)_mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)
+#define SET_ALL_BITS_ONE _mm256_castsi256_pd(_mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
+#define SET_ALL_BITS_ZERO _mm256_castsi256_pd(_mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000))
+#define VECTOR_LOAD _mm256_load_pd
+#define VECTOR_BIT_AND _mm256_and_pd
+#define VECTOR_BIT_OR  _mm256_or_pd
+#define VECTOR_STORE  _mm256_store_pd
+#define VECTOR_AND_NOT _mm256_andnot_pd
+
+#elif (defined(__SSE3))
+
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+  
+#define INTS_PER_VECTOR 4
+#ifdef __i386__
+//#define LONG_INTS_PER_VECTOR 4
+#define LONG_INTS_PER_VECTOR (16/sizeof(long))
+#else
+//#define LONG_INTS_PER_VECTOR 2
+#define LONG_INTS_PER_VECTOR (16/sizeof(long))
+#endif
+#define INT_TYPE __m128i
+#define CAST __m128i*
+#define SET_ALL_BITS_ONE _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
+#define SET_ALL_BITS_ZERO _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000)
+#define VECTOR_LOAD _mm_load_si128
+#define VECTOR_BIT_AND _mm_and_si128
+#define VECTOR_BIT_OR  _mm_or_si128
+#define VECTOR_STORE  _mm_store_si128
+#define VECTOR_AND_NOT _mm_andnot_si128
+
+#endif
+
+
+#include "pll.h"
+#include "pllInternal.h"
+
+#if defined (_MSC_VER)
+#	if defined ( __SSE4_2__ ) || defined (__AVX__)
+#		include <nmmintrin.h>
+#		define __builtin_popcount _mm_popcnt_u32
+#		define __builtin_popcountl _mm_popcnt_u64
+#	else
+#		include <intrin.h>
+	static __inline uint32_t __builtin_popcount (uint32_t a) {
+		// popcnt instruction not available
+		uint32_t b = a - ((a >> 1) & 0x55555555);
+		uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333);
+		uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F;
+		uint32_t e = d * 0x01010101;
+		return   e >> 24;
+	}
+//#		define __builtin_popcount __popcnt
+#		define __builtin_popcountl __popcnt64
+#	endif
+#endif
+
+static pllBoolean tipHomogeneityCheckerPars(pllInstance *tr, nodeptr p, int grouping);
+
+extern const unsigned int mask32[32]; 
+/* vector-specific stuff */
+
+
+extern double masterTime;
+
+/************************************************ pop count stuff ***********************************************/
+
+unsigned int bitcount_32_bit(unsigned int i)
+{
+  return ((unsigned int) __builtin_popcount(i));
+}
+
+/* bit count for 64 bit integers */
+
+//__inline unsigned int bitcount_64_bit(uint64_t i)
+//{
+//  return ((unsigned int) __builtin_popcountl(i));
+//}
+
+/* bit count for 128 bit SSE3 and 256 bit AVX registers */
+
+#if (defined(__SSE3) || defined(__AVX))
+
+#ifdef _WIN32
+ /* emulate with 32-bit version */
+static __inline unsigned int vectorPopcount(INT_TYPE v)
+{
+  PLL_ALIGN_BEGIN unsigned int counts[INTS_PER_VECTOR] PLL_ALIGN_END;
+
+  int
+    i,
+    sum = 0;
+
+  VECTOR_STORE((CAST)counts, v);
+
+  for(i = 0; i < INTS_PER_VECTOR; i++)
+    sum += __builtin_popcount(counts[i]);
+
+  return ((unsigned int)sum);
+}
+#else
+
+static __inline unsigned int vectorPopcount(INT_TYPE v)
+{
+  unsigned long
+    counts[LONG_INTS_PER_VECTOR] __attribute__ ((aligned (PLL_BYTE_ALIGNMENT)));
+
+  int    
+    i,
+    sum = 0;
+  
+  VECTOR_STORE((CAST)counts, v);
+
+  for(i = 0; i < LONG_INTS_PER_VECTOR; i++)
+    sum += __builtin_popcountl(counts[i]);
+             
+  return ((unsigned int)sum);
+}
+#endif
+
+#endif
+
+
+
+/********************************DNA FUNCTIONS *****************************************************************/
+
+
+static int checkerPars(pllInstance *tr, nodeptr p)
+{
+  int group = tr->constraintVector[p->number];
+
+  if(isTip(p->number, tr->mxtips))
+    {
+      group = tr->constraintVector[p->number];
+      return group;
+    }
+  else
+    {
+      if(group != -9) 
+        return group;
+
+      group = checkerPars(tr, p->next->back);
+      if(group != -9) 
+        return group;
+
+      group = checkerPars(tr, p->next->next->back);
+      if(group != -9) 
+        return group;
+
+      return -9;
+    }
+}
+
+static pllBoolean tipHomogeneityCheckerPars(pllInstance *tr, nodeptr p, int grouping)
+{
+  if(isTip(p->number, tr->mxtips))
+    {
+      if(tr->constraintVector[p->number] != grouping) 
+        return PLL_FALSE;
+      else 
+        return PLL_TRUE;
+    }
+  else
+    {   
+      return  (tipHomogeneityCheckerPars(tr, p->next->back, grouping) && tipHomogeneityCheckerPars(tr, p->next->next->back,grouping));      
+    }
+}
+
+static void getxnodeLocal (nodeptr p)
+{
+  nodeptr  s;
+
+  if((s = p->next)->xPars || (s = s->next)->xPars)
+    {
+      p->xPars = s->xPars;
+      s->xPars = 0;
+    }
+
+  assert(p->next->xPars || p->next->next->xPars || p->xPars);
+
+}
+
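+/* Collect a post-order traversal of the inner nodes whose parsimony state
+   vectors need (re)computation; every visited node contributes four entries
+   to ti (its own number, its two child numbers, and one padding slot). */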
+static void computeTraversalInfoParsimony(nodeptr p, int *ti, int *counter, int maxTips, pllBoolean full)
+{        
+  nodeptr 
+    q = p->next->back,
+    r = p->next->next->back;
+  
+  if(! p->xPars)
+    getxnodeLocal(p);  
+  
+  if(full)
+    {
+       if(q->number > maxTips) 
+         computeTraversalInfoParsimony(q, ti, counter, maxTips, full);
+      
+      if(r->number > maxTips) 
+        computeTraversalInfoParsimony(r, ti, counter, maxTips, full);
+    }
+  else
+    {
+      if(q->number > maxTips && !q->xPars) 
+        computeTraversalInfoParsimony(q, ti, counter, maxTips, full);
+      
+      if(r->number > maxTips && !r->xPars) 
+        computeTraversalInfoParsimony(r, ti, counter, maxTips, full);
+    }
+  
+  
+  ti[*counter]     = p->number;
+  ti[*counter + 1] = q->number;
+  ti[*counter + 2] = r->number;
+  *counter = *counter + 4;
+}
+
+
+
+
+
+
+
+#if (defined(__SSE3) || defined(__AVX))
+
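+/* Vectorised Fitch preliminary-state step: for each traversal entry the
+   per-state presence bitmaps of the two children are intersected; wherever
+   the intersection is empty the union is stored instead and the parsimony
+   score is incremented by one per such site (counted via popcount). */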
+static void newviewParsimonyIterativeFast(pllInstance *tr, partitionList *pr)
+{    
+  INT_TYPE
+    allOne = SET_ALL_BITS_ONE;
+
+  int 
+    model,
+    *ti = tr->ti,
+    count = ti[0],
+    index; 
+
+  for(index = 4; index < count; index += 4)
+    {      
+      unsigned int
+        totalScore = 0;
+
+      size_t
+        pNumber = (size_t)ti[index],
+        qNumber = (size_t)ti[index + 1],
+        rNumber = (size_t)ti[index + 2];
+      
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        {
+          size_t
+            k,
+            states = pr->partitionData[model]->states,
+            width = pr->partitionData[model]->parsimonyLength;
+            
+          unsigned int  
+            i;      
+                 
+          switch(states)
+            {
+            case 2:       
+              {
+                parsimonyNumber
+                  *left[2],
+                  *right[2],
+                  *this[2];
+
+                for(k = 0; k < 2; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    INT_TYPE
+                      s_r, s_l, v_N,
+                      l_A, l_C,
+                      v_A, v_C;          
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[0][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[0][i]));
+                    l_A = VECTOR_BIT_AND(s_l, s_r);
+                    v_A = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[1][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[1][i]));
+                    l_C = VECTOR_BIT_AND(s_l, s_r);
+                    v_C = VECTOR_BIT_OR(s_l, s_r);                                                                
+                    
+                    v_N = VECTOR_BIT_OR(l_A, l_C);
+                    
+                    VECTOR_STORE((CAST)(&this[0][i]), VECTOR_BIT_OR(l_A, VECTOR_AND_NOT(v_N, v_A)));
+                    VECTOR_STORE((CAST)(&this[1][i]), VECTOR_BIT_OR(l_C, VECTOR_AND_NOT(v_N, v_C)));                                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);            
+                  }
+              }
+              break;
+            case 4:
+              {
+                parsimonyNumber
+                  *left[4],
+                  *right[4],
+                  *this[4];
+
+                for(k = 0; k < 4; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    INT_TYPE
+                      s_r, s_l, v_N,
+                      l_A, l_C, l_G, l_T,
+                      v_A, v_C, v_G, v_T;                
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[0][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[0][i]));
+                    l_A = VECTOR_BIT_AND(s_l, s_r);
+                    v_A = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[1][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[1][i]));
+                    l_C = VECTOR_BIT_AND(s_l, s_r);
+                    v_C = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[2][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[2][i]));
+                    l_G = VECTOR_BIT_AND(s_l, s_r);
+                    v_G = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[3][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[3][i]));
+                    l_T = VECTOR_BIT_AND(s_l, s_r);
+                    v_T = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    v_N = VECTOR_BIT_OR(VECTOR_BIT_OR(l_A, l_C), VECTOR_BIT_OR(l_G, l_T));                                
+                    
+                    VECTOR_STORE((CAST)(&this[0][i]), VECTOR_BIT_OR(l_A, VECTOR_AND_NOT(v_N, v_A)));
+                    VECTOR_STORE((CAST)(&this[1][i]), VECTOR_BIT_OR(l_C, VECTOR_AND_NOT(v_N, v_C)));
+                    VECTOR_STORE((CAST)(&this[2][i]), VECTOR_BIT_OR(l_G, VECTOR_AND_NOT(v_N, v_G)));
+                    VECTOR_STORE((CAST)(&this[3][i]), VECTOR_BIT_OR(l_T, VECTOR_AND_NOT(v_N, v_T)));                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);  
+                  }
+              }
+              break;
+            case 20:
+              {
+                parsimonyNumber
+                  *left[20],
+                  *right[20],
+                  *this[20];
+
+                for(k = 0; k < 20; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    size_t j;
+                    
+                    INT_TYPE
+                      s_r, s_l, 
+                      v_N = SET_ALL_BITS_ZERO,
+                      l_A[20], 
+                      v_A[20];           
+                    
+                    for(j = 0; j < 20; j++)
+                      {
+                        s_l = VECTOR_LOAD((CAST)(&left[j][i]));
+                        s_r = VECTOR_LOAD((CAST)(&right[j][i]));
+                        l_A[j] = VECTOR_BIT_AND(s_l, s_r);
+                        v_A[j] = VECTOR_BIT_OR(s_l, s_r);
+                        
+                        v_N = VECTOR_BIT_OR(v_N, l_A[j]);
+                      }
+                    
+                    for(j = 0; j < 20; j++)                 
+                      VECTOR_STORE((CAST)(&this[j][i]), VECTOR_BIT_OR(l_A[j], VECTOR_AND_NOT(v_N, v_A[j])));                                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);
+                  }
+              }
+              break;
+            default:
+              {
+                parsimonyNumber
+                  *left[32], 
+                  *right[32],
+                  *this[32];
+
+                assert(states <= 32);
+                
+                for(k = 0; k < states; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * states * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    size_t j;
+                    
+                    INT_TYPE
+                      s_r, s_l, 
+                      v_N = SET_ALL_BITS_ZERO,
+                      l_A[32], 
+                      v_A[32];           
+                    
+                    for(j = 0; j < states; j++)
+                      {
+                        s_l = VECTOR_LOAD((CAST)(&left[j][i]));
+                        s_r = VECTOR_LOAD((CAST)(&right[j][i]));
+                        l_A[j] = VECTOR_BIT_AND(s_l, s_r);
+                        v_A[j] = VECTOR_BIT_OR(s_l, s_r);
+                        
+                        v_N = VECTOR_BIT_OR(v_N, l_A[j]);
+                      }
+                    
+                    for(j = 0; j < states; j++)             
+                      VECTOR_STORE((CAST)(&this[j][i]), VECTOR_BIT_OR(l_A[j], VECTOR_AND_NOT(v_N, v_A[j])));                                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);
+                  }                             
+              }
+            }            
+        }
+
+      tr->parsimonyScore[pNumber] = totalScore + tr->parsimonyScore[rNumber] + tr->parsimonyScore[qNumber];      
+    }
+}
+
+
+
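+/* Score the branch between nodes ti[1] and ti[2]: after updating any stale
+   state vectors, count the sites whose preliminary state sets on the two
+   sides have an empty intersection, with an early exit as soon as the
+   running score reaches tr->bestParsimony. */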
+static unsigned int evaluateParsimonyIterativeFast(pllInstance *tr, partitionList *pr)
+{
+  INT_TYPE 
+    allOne = SET_ALL_BITS_ONE;
+
+  size_t 
+    pNumber = (size_t)tr->ti[1],
+    qNumber = (size_t)tr->ti[2];
+
+  int
+    model;
+
+  unsigned int 
+    bestScore = tr->bestParsimony,    
+    sum;
+
+  if(tr->ti[0] > 4)
+    newviewParsimonyIterativeFast(tr, pr);
+
+  sum = tr->parsimonyScore[pNumber] + tr->parsimonyScore[qNumber];
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      size_t
+        k,
+        states = pr->partitionData[model]->states,
+        width  = pr->partitionData[model]->parsimonyLength,
+        i;
+
+       switch(states)
+         {
+         case 2:
+           {
+             parsimonyNumber
+               *left[2],
+               *right[2];
+             
+             for(k = 0; k < 2; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
+               }     
+             
+             for(i = 0; i < width; i += INTS_PER_VECTOR)
+               {                                               
+                 INT_TYPE      
+                   l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[0][i])), VECTOR_LOAD((CAST)(&right[0][i]))),
+                   l_C = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[1][i])), VECTOR_LOAD((CAST)(&right[1][i]))),            
+                   v_N = VECTOR_BIT_OR(l_A, l_C);
+                 
+                 v_N = VECTOR_AND_NOT(v_N, allOne);
+                 
+                 sum += vectorPopcount(v_N);
+                 
+                 if(sum >= bestScore)
+                   return sum;                         
+               }
+           }
+           break;
+         case 4:
+           {
+             parsimonyNumber
+               *left[4],
+               *right[4];
+      
+             for(k = 0; k < 4; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
+               }        
+
+             for(i = 0; i < width; i += INTS_PER_VECTOR)
+               {                                                
+                 INT_TYPE      
+                   l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[0][i])), VECTOR_LOAD((CAST)(&right[0][i]))),
+                   l_C = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[1][i])), VECTOR_LOAD((CAST)(&right[1][i]))),
+                   l_G = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[2][i])), VECTOR_LOAD((CAST)(&right[2][i]))),
+                   l_T = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[3][i])), VECTOR_LOAD((CAST)(&right[3][i]))),
+                   v_N = VECTOR_BIT_OR(VECTOR_BIT_OR(l_A, l_C), VECTOR_BIT_OR(l_G, l_T));     
+                 
+                 v_N = VECTOR_AND_NOT(v_N, allOne);
+                 
+                 sum += vectorPopcount(v_N);
+                 
+                 if(sum >= bestScore)            
+                   return sum;          
+               }                 
+           }
+           break;
+         case 20:
+           {
+             parsimonyNumber
+               *left[20],
+               *right[20];
+             
+              for(k = 0; k < 20; k++)
+                {
+                  left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
+                  right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
+                }  
+           
+              for(i = 0; i < width; i += INTS_PER_VECTOR)
+                {                              
+                  int 
+                    j;
+                  
+                  INT_TYPE      
+                    l_A,
+                    v_N = SET_ALL_BITS_ZERO;     
+                  
+                  for(j = 0; j < 20; j++)
+                    {
+                      l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[j][i])), VECTOR_LOAD((CAST)(&right[j][i])));
+                      v_N = VECTOR_BIT_OR(l_A, v_N);
+                    }
+                  
+                  v_N = VECTOR_AND_NOT(v_N, allOne);
+                  
+                  sum += vectorPopcount(v_N);          
+                  
+                  if(sum >= bestScore)      
+                    return sum;                        
+                }
+           }
+           break;
+         default:
+           {
+             parsimonyNumber
+               *left[32],  
+               *right[32]; 
+
+             assert(states <= 32);
+
+             for(k = 0; k < states; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
+               }  
+           
+             for(i = 0; i < width; i += INTS_PER_VECTOR)
+               {                               
+                 size_t
+                   j;
+                 
+                 INT_TYPE      
+                   l_A,
+                   v_N = SET_ALL_BITS_ZERO;     
+                 
+                 for(j = 0; j < states; j++)
+                   {
+                     l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[j][i])), VECTOR_LOAD((CAST)(&right[j][i])));
+                     v_N = VECTOR_BIT_OR(l_A, v_N);
+                   }
+                 
+                 v_N = VECTOR_AND_NOT(v_N, allOne);
+                 
+                 sum += vectorPopcount(v_N);           
+                 
+                 if(sum >= bestScore)         
+                   return sum;                 
+               }
+           }
+         }
+    }
+  
+  return sum;
+}
+
+
+#else
+static void newviewParsimonyIterativeFast(pllInstance *tr, partitionList * pr)
+{    
+  int 
+    model,
+    *ti = tr->ti,
+    count = ti[0],
+    index; 
+
+  for(index = 4; index < count; index += 4)
+    {      
+      unsigned int
+        totalScore = 0;
+
+      size_t
+        pNumber = (size_t)ti[index],
+        qNumber = (size_t)ti[index + 1],
+        rNumber = (size_t)ti[index + 2];
+      
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        {
+          size_t
+            k,
+            states = pr->partitionData[model]->states,
+            width = pr->partitionData[model]->parsimonyLength;    
+            
+          unsigned int  
+            i;      
+                 
+          switch(states)
+            {
+            case 2:       
+              {
+                parsimonyNumber
+                  *left[2],
+                  *right[2],
+                  *this[2];
+                
+                parsimonyNumber
+                   o_A,
+                   o_C,
+                   t_A,
+                   t_C, 
+                   t_N;
+                
+                for(k = 0; k < 2; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i++)
+                  {               
+                    t_A = left[0][i] & right[0][i];
+                    t_C = left[1][i] & right[1][i];                
+
+                    o_A = left[0][i] | right[0][i];
+                    o_C = left[1][i] | right[1][i];
+                  
+                    t_N = ~(t_A | t_C);   
+
+                    this[0][i] = t_A | (t_N & o_A);
+                    this[1][i] = t_C | (t_N & o_C);                
+                    
+                    totalScore += ((unsigned int) __builtin_popcount(t_N));
+                  }
+              }
+              break;
+            case 4:
+              {
+                parsimonyNumber
+                  *left[4],
+                  *right[4],
+                  *this[4];
+
+                for(k = 0; k < 4; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
+                  }
+
+                parsimonyNumber
+                   o_A,
+                   o_C,
+                   o_G,
+                   o_T,
+                   t_A,
+                   t_C,
+                   t_G,
+                   t_T, 
+                   t_N;
+
+                for(i = 0; i < width; i++)
+                  {               
+                    t_A = left[0][i] & right[0][i];
+                    t_C = left[1][i] & right[1][i];
+                    t_G = left[2][i] & right[2][i];       
+                    t_T = left[3][i] & right[3][i];
+
+                    o_A = left[0][i] | right[0][i];
+                    o_C = left[1][i] | right[1][i];
+                    o_G = left[2][i] | right[2][i];       
+                    o_T = left[3][i] | right[3][i];
+
+                    t_N = ~(t_A | t_C | t_G | t_T);       
+
+                    this[0][i] = t_A | (t_N & o_A);
+                    this[1][i] = t_C | (t_N & o_C);
+                    this[2][i] = t_G | (t_N & o_G);
+                    this[3][i] = t_T | (t_N & o_T); 
+                    
+                    totalScore += ((unsigned int) __builtin_popcount(t_N));
+                  }
+              }
+              break;
+            case 20:
+              {
+                parsimonyNumber
+                  *left[20],
+                  *right[20],
+                  *this[20];
+
+                parsimonyNumber
+                  o_A[20],
+                  t_A[20],        
+                  t_N;
+
+                for(k = 0; k < 20; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i++)
+                  {               
+                    size_t k;
+                    
+                    t_N = 0;
+
+                    for(k = 0; k < 20; k++)
+                      {
+                        t_A[k] = left[k][i] & right[k][i];
+                        o_A[k] = left[k][i] | right[k][i];
+                        t_N = t_N | t_A[k];
+                      }
+                    
+                    t_N = ~t_N;
+
+                    for(k = 0; k < 20; k++)                   
+                      this[k][i] = t_A[k] | (t_N & o_A[k]);                
+                    
+                    totalScore += ((unsigned int) __builtin_popcount(t_N));
+                  }
+              }
+              break;
+            default:
+              {         
+                parsimonyNumber
+                  *left[32],
+                  *right[32],
+                  *this[32];
+                
+                parsimonyNumber
+                  o_A[32],
+                  t_A[32],        
+                  t_N;
+                
+                assert(states <= 32);
+                
+                for(k = 0; k < states; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * states * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
+                  }
+                
+                for(i = 0; i < width; i++)
+                  {               
+                    t_N = 0;
+                    
+                    for(k = 0; k < states; k++)
+                      {
+                        t_A[k] = left[k][i] & right[k][i];
+                        o_A[k] = left[k][i] | right[k][i];
+                        t_N = t_N | t_A[k];
+                      }
+                    
+                    t_N = ~t_N;
+                    
+                    for(k = 0; k < states; k++)               
+                      this[k][i] = t_A[k] | (t_N & o_A[k]);                
+                    
+                    totalScore += ((unsigned int) __builtin_popcount(t_N));
+                  }
+              }                       
+            } 
+        }
+
+      tr->parsimonyScore[pNumber] = totalScore + tr->parsimonyScore[rNumber] + tr->parsimonyScore[qNumber];      
+    }
+}
+
+
+static unsigned int evaluateParsimonyIterativeFast(pllInstance *tr, partitionList * pr)
+{
+  size_t 
+    pNumber = (size_t)tr->ti[1],
+    qNumber = (size_t)tr->ti[2];
+
+  int
+    model;
+
+  unsigned int 
+    bestScore = tr->bestParsimony,    
+    sum;
+
+  if(tr->ti[0] > 4)
+    newviewParsimonyIterativeFast(tr, pr); 
+
+  sum = tr->parsimonyScore[pNumber] + tr->parsimonyScore[qNumber];
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      size_t
+        k,
+        states = pr->partitionData[model]->states,
+        width  = pr->partitionData[model]->parsimonyLength, 
+        i;
+
+       switch(states)
+         {
+         case 2:
+           {
+             parsimonyNumber 
+               t_A,
+               t_C,           
+               t_N,
+               *left[2],
+               *right[2];
+             
+             for(k = 0; k < 2; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
+               }     
+             
+             for(i = 0; i < width; i++)
+               {                                               
+                 t_A = left[0][i] & right[0][i];
+                 t_C = left[1][i] & right[1][i];
+                 
+                  t_N = ~(t_A | t_C);
+
+                  sum += ((unsigned int) __builtin_popcount(t_N));
+                 
+                 if(sum >= bestScore)
+                   return sum;                         
+               }
+           }
+           break;
+         case 4:
+           {
+             parsimonyNumber
+               t_A,
+               t_C,
+               t_G,
+               t_T,
+               t_N,
+               *left[4],
+               *right[4];
+      
+             for(k = 0; k < 4; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
+               }        
+
+             for(i = 0; i < width; i++)
+               {                                                
+                  t_A = left[0][i] & right[0][i];
+                  t_C = left[1][i] & right[1][i];
+                  t_G = left[2][i] & right[2][i];         
+                  t_T = left[3][i] & right[3][i];
+
+                  t_N = ~(t_A | t_C | t_G | t_T);
+
+                  sum += ((unsigned int) __builtin_popcount(t_N));
+                 
+                 if(sum >= bestScore)            
+                   return sum;          
+               }                 
+           }
+           break;
+         case 20:
+           {
+             parsimonyNumber
+               t_A,
+               t_N,
+               *left[20],
+               *right[20];
+             
+              for(k = 0; k < 20; k++)
+                {
+                  left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
+                  right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
+                }  
+           
+              for(i = 0; i < width; i++)
+                { 
+                  t_N = 0;
+                  
+                  for(k = 0; k < 20; k++)
+                    {
+                      t_A = left[k][i] & right[k][i];
+                      t_N = t_N | t_A;
+                    }
+               
+                  t_N = ~t_N;
+
+                  sum += ((unsigned int) __builtin_popcount(t_N));
+                  
+                  if(sum >= bestScore)      
+                    return sum;                        
+                }
+           }
+           break;
+         default:
+           {
+             parsimonyNumber
+               t_A,
+               t_N,
+               *left[32], 
+               *right[32];  
+
+             assert(states <= 32);
+
+             for(k = 0; k < states; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
+               }  
+           
+             for(i = 0; i < width; i++)
+               {                               
+                 t_N = 0;
+                  
+                 for(k = 0; k < states; k++)
+                   {
+                     t_A = left[k][i] & right[k][i];
+                     t_N = t_N | t_A;
+                   }
+               
+                  t_N = ~t_N;
+
+                  sum += ((unsigned int) __builtin_popcount(t_N));
+                                                 
+                 if(sum >= bestScore)                     
+                   return sum;                     
+               }                     
+           }
+         }
+    }
+  
+  return sum;
+}
+
+#endif
+
+
+
+
+
+
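+/* Driver: build the traversal descriptor for the subtrees behind p and
+   q = p->back (full or lazy, depending on the flag) and evaluate the
+   parsimony score across that branch. */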
+static unsigned int evaluateParsimony(pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean full)
+{
+  volatile unsigned int result;
+  nodeptr q = p->back;
+  int
+    *ti = tr->ti,
+    counter = 4;
+  
+  ti[1] = p->number;
+  ti[2] = q->number;
+
+  if(full)
+    {
+      if(p->number > tr->mxtips)
+        computeTraversalInfoParsimony(p, ti, &counter, tr->mxtips, full);
+      if(q->number > tr->mxtips)
+        computeTraversalInfoParsimony(q, ti, &counter, tr->mxtips, full); 
+    }
+  else
+    {
+      if(p->number > tr->mxtips && !p->xPars)
+        computeTraversalInfoParsimony(p, ti, &counter, tr->mxtips, full);
+      if(q->number > tr->mxtips && !q->xPars)
+        computeTraversalInfoParsimony(q, ti, &counter, tr->mxtips, full); 
+    }
+
+  ti[0] = counter;
+
+  result = evaluateParsimonyIterativeFast(tr, pr);
+
+  return result;
+}
+
+
+static void newviewParsimony(pllInstance *tr, partitionList *pr, nodeptr  p)
+{     
+  if(p->number <= tr->mxtips)
+    return;
+
+  {
+    int 
+      counter = 4;     
+           
+    computeTraversalInfoParsimony(p, tr->ti, &counter, tr->mxtips, PLL_FALSE);              
+    tr->ti[0] = counter;            
+    
+    newviewParsimonyIterativeFast(tr, pr);
+  }
+}
+
+
+
+
+
+/****************************************************************************************************************************************/
+
+static void insertParsimony (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{
+  nodeptr  r;
+  
+  r = q->back;
+  
+  hookupDefault(p->next,       q);
+  hookupDefault(p->next->next, r);
+   
+  newviewParsimony(tr, pr, p);
+} 
+
+
+
+static nodeptr buildNewTip (pllInstance *tr, nodeptr p)
+{ 
+  nodeptr  q;
+
+  q = tr->nodep[(tr->nextnode)++];
+  hookupDefault(p, q);
+  q->next->back = (nodeptr)NULL;
+  q->next->next->back = (nodeptr)NULL;
+ 
+  return  q;
+} 
+
+static void buildSimpleTree (pllInstance *tr, partitionList *pr, int ip, int iq, int ir)
+{    
+  nodeptr  p, s;
+  int  i;
+  
+  i = PLL_MIN(ip, iq);
+  if (ir < i)  i = ir; 
+  tr->start = tr->nodep[i];
+  tr->ntips = 3;
+  p = tr->nodep[ip];
+  hookupDefault(p, tr->nodep[iq]);
+  s = buildNewTip(tr, tr->nodep[ir]);
+  insertParsimony(tr, pr, s, p);
+}
+
+
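+/* Tentatively insert subtree p into the branch q--(q->back), score the
+   resulting tree, record it in tr->insertNode/tr->removeNode if it improves
+   tr->bestParsimony, then undo the insertion (optionally restoring the
+   original branch lengths). Constrained searches skip forbidden placements. */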
+static void testInsertParsimony (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q, pllBoolean saveBranches)
+{ 
+  unsigned int 
+    mp;
+ 
+  nodeptr  
+    r = q->back;   
+
+  pllBoolean
+    doIt = PLL_TRUE;
+
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  if(tr->grouped)
+    {
+      int 
+        rNumber = tr->constraintVector[r->number],
+        qNumber = tr->constraintVector[q->number],
+        pNumber = tr->constraintVector[p->number];
+
+      doIt = PLL_FALSE;
+     
+      if(pNumber == -9)
+        pNumber = checkerPars(tr, p->back);
+      if(pNumber == -9)
+        doIt = PLL_TRUE;
+      else
+        {
+          if(qNumber == -9)
+            qNumber = checkerPars(tr, q);
+
+          if(rNumber == -9)
+            rNumber = checkerPars(tr, r);
+
+          if(pNumber == rNumber || pNumber == qNumber)
+            doIt = PLL_TRUE;       
+        }
+    }
+
+  if(doIt)
+    {
+      double 
+        *z = rax_malloc(numBranches*sizeof(double));
+      
+      if(saveBranches)
+        {
+          int i;
+          
+          for(i = 0; i < numBranches; i++)
+            z[i] = q->z[i];
+        }
+
+      insertParsimony(tr, pr, p, q);
+  
+      mp = evaluateParsimony(tr, pr, p->next->next, PLL_FALSE);
+
+      if(mp < tr->bestParsimony)
+        {
+          tr->bestParsimony = mp;
+          tr->insertNode = q;
+          tr->removeNode = p;
+        }
+      
+      if(saveBranches)
+        hookup(q, r, z, numBranches);
+      else
+        hookupDefault(q, r);
+      
+      p->next->next->back = p->next->back = (nodeptr) NULL;
+      rax_free(z);
+    }
+       
+  return;
+} 
+
+
+static void restoreTreeParsimony(pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{ 
+  nodeptr
+    r = q->back;
+  
+  int counter = 4;
+  
+  hookupDefault(p->next,       q);
+  hookupDefault(p->next->next, r);
+  
+  computeTraversalInfoParsimony(p, tr->ti, &counter, tr->mxtips, PLL_FALSE);              
+  tr->ti[0] = counter;
+    
+  newviewParsimonyIterativeFast(tr, pr);
+}
+
+
+static void addTraverseParsimony (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q, int mintrav, int maxtrav, pllBoolean doAll, pllBoolean saveBranches)
+{        
+  if (doAll || (--mintrav <= 0))               
+    testInsertParsimony(tr, pr, p, q, saveBranches);
+
+  if (((q->number > tr->mxtips)) && ((--maxtrav > 0) || doAll))
+    {         
+      addTraverseParsimony(tr, pr, p, q->next->back, mintrav, maxtrav, doAll, saveBranches);
+      addTraverseParsimony(tr, pr, p, q->next->next->back, mintrav, maxtrav, doAll, saveBranches);
+    }
+}
+
+
+
+
+
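+/* Produce a random permutation of 1..n in perm[] (a Fisher-Yates style
+   shuffle driven by tr->randomNumberSeed). */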
+static void makePermutationFast(int *perm, int n, pllInstance *tr)
+{    
+  int  
+    i, 
+    j, 
+    k;
+
+  for (i = 1; i <= n; i++)    
+    perm[i] = i;               
+
+  for (i = 1; i <= n; i++) 
+    {      
+      double d =  randum(&tr->randomNumberSeed);
+
+      k =  (int)((double)(n + 1 - i) * d);
+      
+      j        = perm[i];
+
+      perm[i]     = perm[i + k];
+      perm[i + k] = j; 
+    }
+}
+
+//static nodeptr  removeNodeParsimony (nodeptr p, tree *tr)
+static nodeptr  removeNodeParsimony (nodeptr p)
+{ 
+  nodeptr  q, r;         
+
+  q = p->next->back;
+  r = p->next->next->back;   
+    
+  hookupDefault(q, r);
+
+  p->next->next->back = p->next->back = (node *) NULL;
+  
+  return  q;
+}
+
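+/* One SPR-style parsimony rearrangement around the branch p--q: prune at p
+   and/or q (where permitted by any topological constraint), try re-insertion
+   into the branches within the mintrav..maxtrav radius via
+   addTraverseParsimony, then restore the original attachment. */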
+static int rearrangeParsimony(pllInstance *tr, partitionList *pr, nodeptr p, int mintrav, int maxtrav, pllBoolean doAll)
+{   
+  nodeptr  
+    p1, 
+    p2, 
+    q, 
+    q1, 
+    q2;
+  
+  int      
+    mintrav2; 
+
+  pllBoolean
+    doP = PLL_TRUE,
+    doQ = PLL_TRUE;
+           
+  if (maxtrav > tr->ntips - 3)  
+    maxtrav = tr->ntips - 3; 
+
+  assert(mintrav == 1);
+
+  if(maxtrav < mintrav)
+    return 0;
+
+  q = p->back;
+
+  if(tr->constrained)
+    {    
+      if(! tipHomogeneityCheckerPars(tr, p->back, 0))
+        doP = PLL_FALSE;
+        
+      if(! tipHomogeneityCheckerPars(tr, q->back, 0))
+        doQ = PLL_FALSE;
+                        
+      if(doQ == PLL_FALSE && doP == PLL_FALSE)
+        return 0;
+    }  
+
+  if((p->number > tr->mxtips) && doP) 
+    {     
+      p1 = p->next->back;
+      p2 = p->next->next->back;
+      
+      if ((p1->number > tr->mxtips) || (p2->number > tr->mxtips)) 
+        {                 
+          //removeNodeParsimony(p, tr);          
+          removeNodeParsimony(p);                
+
+          if ((p1->number > tr->mxtips)) 
+            {
+              addTraverseParsimony(tr, pr, p, p1->next->back, mintrav, maxtrav, doAll, PLL_FALSE);
+              addTraverseParsimony(tr, pr, p, p1->next->next->back, mintrav, maxtrav, doAll, PLL_FALSE);
+            }
+         
+          if ((p2->number > tr->mxtips)) 
+            {
+              addTraverseParsimony(tr, pr, p, p2->next->back, mintrav, maxtrav, doAll, PLL_FALSE);
+              addTraverseParsimony(tr, pr, p, p2->next->next->back, mintrav, maxtrav, doAll, PLL_FALSE);
+            }
+            
+           
+          hookupDefault(p->next,       p1);
+          hookupDefault(p->next->next, p2);
+
+          newviewParsimony(tr, pr, p);
+        }
+    }  
+       
+  if ((q->number > tr->mxtips) && (maxtrav > 0) && doQ) 
+    {
+      q1 = q->next->back;
+      q2 = q->next->next->back;
+
+      if (
+          (
+           (q1->number > tr->mxtips) && 
+           ((q1->next->back->number > tr->mxtips) || (q1->next->next->back->number > tr->mxtips))
+           )
+          ||
+          (
+           (q2->number > tr->mxtips) && 
+           ((q2->next->back->number > tr->mxtips) || (q2->next->next->back->number > tr->mxtips))
+           )
+          )
+        {          
+
+          //removeNodeParsimony(q, tr);
+          removeNodeParsimony(q);
+          
+          mintrav2 = mintrav > 2 ? mintrav : 2;
+          
+          if ((q1->number > tr->mxtips)) 
+            {
+              addTraverseParsimony(tr, pr, q, q1->next->back, mintrav2 , maxtrav, doAll, PLL_FALSE);
+              addTraverseParsimony(tr, pr, q, q1->next->next->back, mintrav2 , maxtrav, doAll, PLL_FALSE);
+            }
+         
+          if ((q2->number > tr->mxtips)) 
+            {
+              addTraverseParsimony(tr, pr, q, q2->next->back, mintrav2 , maxtrav, doAll, PLL_FALSE);
+              addTraverseParsimony(tr, pr, q, q2->next->next->back, mintrav2 , maxtrav, doAll, PLL_FALSE);
+            }      
+           
+          hookupDefault(q->next,       q1);
+          hookupDefault(q->next->next, q2);
+           
+          newviewParsimony(tr, pr, q);
+        }
+    }
+
+  return 1;
+} 
+
+
+static void restoreTreeRearrangeParsimony(pllInstance *tr, partitionList *pr)
+{    
+  removeNodeParsimony(tr->removeNode);  
+  //removeNodeParsimony(tr->removeNode, tr);  
+  restoreTreeParsimony(tr, pr, tr->removeNode, tr->insertNode);
+}
+
+/*
+static pllBoolean isInformative2(pllInstance *tr, int site)
+{
+  int
+    informativeCounter = 0,
+    check[256],   
+    j,   
+    undetermined = 15;
+
+  unsigned char
+    nucleotide,
+    target = 0;
+        
+  for(j = 0; j < 256; j++)
+    check[j] = 0;
+  
+  for(j = 1; j <= tr->mxtips; j++)
+    {      
+      nucleotide = tr->yVector[j][site];            
+      check[nucleotide] =  check[nucleotide] + 1;                  
+    }
+  
+  
+  if(check[1] > 1)
+    {
+      informativeCounter++;    
+      target = target | 1;
+    }
+  if(check[2] > 1)
+    {
+      informativeCounter++; 
+      target = target | 2;
+    }
+  if(check[4] > 1)
+    {
+      informativeCounter++; 
+      target = target | 4;
+    }
+  if(check[8] > 1)
+    {
+      informativeCounter++; 
+      target = target | 8;
+    }
+          
+  if(informativeCounter >= 2)
+    return PLL_TRUE;    
+  else
+    {        
+      for(j = 0; j < undetermined; j++)
+        {
+          if(j == 3 || j == 5 || j == 6 || j == 7 || j == 9 || j == 10 || j == 11 || 
+             j == 12 || j == 13 || j == 14)
+            {
+              if(check[j] > 1)
+                {
+                  if(!(target & j))
+                    return PLL_TRUE;
+                }
+            }
+        } 
+    }
+     
+  return PLL_FALSE;          
+}
+*/
+
+static pllBoolean isInformative(pllInstance *tr, int dataType, int site)
+{
+  int
+    informativeCounter = 0,
+    check[256],   
+    j,   
+    undetermined = getUndetermined(dataType);
+
+  const unsigned int
+    *bitVector = getBitVector(dataType);
+
+  unsigned char
+    nucleotide;
+  
+        
+  for(j = 0; j < 256; j++)
+    check[j] = 0;
+  
+  for(j = 1; j <= tr->mxtips; j++)
+    {      
+      nucleotide = tr->yVector[j][site];            
+      check[nucleotide] =  check[nucleotide] + 1;
+      assert(bitVector[nucleotide] > 0);                   
+    }
+  
+  for(j = 0; j < undetermined; j++)
+    {
+      if(check[j] > 0)
+        informativeCounter++;    
+    } 
+          
+  if(informativeCounter <= 1)
+    return PLL_FALSE;    
+  else
+    {        
+      for(j = 0; j < undetermined; j++)
+        {
+          if(check[j] > 1)
+            return PLL_TRUE;
+        } 
+    }
+     
+  return PLL_FALSE;          
+}
+
+
+static void determineUninformativeSites(pllInstance *tr, partitionList *pr, int *informative)
+{
+  int 
+    model,
+    number = 0,
+    i;
+
+  /* 
+     Not all characters are useful in constructing a parsimony tree. 
+     Invariant characters, those that have the same state in all taxa, 
+     are obviously useless and are ignored by the method. Characters in 
+     which a state occurs in only one taxon are also ignored. 
+     All these characters are called parsimony uninformative.
+
+     Alternative definition: informative columns contain at least two types
+     of nucleotides, and each nucleotide must appear at least twice in each 
+     column. This is awkward to check for when using, e.g., ambiguous DNA
+     encodings.
+  */
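+  /* Illustrative (hypothetical) 4-taxon example: the column A,A,C,C is
+     parsimony-informative (two states, each observed twice), whereas A,A,A,C
+     and A,C,G,T are not -- they cost the same number of changes on every
+     topology and are therefore skipped below. */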
+
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      for(i = pr->partitionData[model]->lower; i < pr->partitionData[model]->upper; i++)
+        {
+           if(isInformative(tr, pr->partitionData[model]->dataType, i))
+             informative[i] = 1;
+           else
+             {
+               informative[i] = 0;
+               number++;
+             }  
+        }      
+    }
+
+ 
+  /* printf("Uninformative Patterns: %d\n", number); */
+}
+
+
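+/* Helper for nodeRectifierPars: rewrite the inner-node entries of tr->nodep
+   so that they follow a depth-first traversal of the current topology,
+   starting from tr->start->back. */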
+static void reorderNodes(pllInstance *tr, nodeptr *np, nodeptr p, int *count)
+{
+  int i, found = 0;
+
+  if((p->number <= tr->mxtips))    
+    return;
+  else
+    {              
+      for(i = tr->mxtips + 1; (i <= (tr->mxtips + tr->mxtips - 1)) && (found == 0); i++)
+        {
+          if (p == np[i] || p == np[i]->next || p == np[i]->next->next)
+            {
+              if(p == np[i])                           
+                tr->nodep[*count + tr->mxtips + 1] = np[i];                             
+              else
+                {
+                  if(p == np[i]->next)            
+                    tr->nodep[*count + tr->mxtips + 1] = np[i]->next;                      
+                  else             
+                    tr->nodep[*count + tr->mxtips + 1] = np[i]->next->next;                                 
+                }
+
+              found = 1;                     
+              *count = *count + 1;
+            }
+        }            
+     
+      assert(found != 0);
+
+      reorderNodes(tr, np, p->next->back, count);     
+      reorderNodes(tr, np, p->next->next->back, count);                
+    }
+}
+
+
+
+static void nodeRectifierPars(pllInstance *tr)
+{
+  nodeptr *np = (nodeptr *)rax_malloc(2 * tr->mxtips * sizeof(nodeptr));
+  int i;
+  int count = 0;
+  
+  tr->start       = tr->nodep[1];
+  tr->rooted      = PLL_FALSE;
+
+  /* TODO why is tr->rooted set to PLL_FALSE here ?*/
+  
+  for(i = tr->mxtips + 1; i <= (tr->mxtips + tr->mxtips - 1); i++)
+    np[i] = tr->nodep[i];           
+  
+  reorderNodes(tr, np, tr->start->back, &count); 
+
+ 
+  rax_free(np);
+}
+
+
+  
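+/* Bit-pack the parsimony-informative sites of every partition for the
+   kernels above: for each tip and each state, one presence bit per site
+   (PLL_PCF sites per parsimonyNumber word, sites replicated aliaswgt times),
+   padded with all-ones words so the row width is a multiple of the vector
+   length when SSE3/AVX is enabled; also allocates and zeroes
+   tr->parsimonyScore. */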
+static void compressDNA(pllInstance *tr, partitionList *pr, int *informative)
+{
+  size_t
+    totalNodes,
+    i,
+    model;
+   
+  totalNodes = 2 * (size_t)tr->mxtips;
+
+ 
+
+  for(model = 0; model < (size_t) pr->numberOfPartitions; model++)
+    {
+      size_t
+        k,
+        states = (size_t)pr->partitionData[model]->states,
+        compressedEntries,
+        compressedEntriesPadded,
+        entries = 0, 
+        lower = pr->partitionData[model]->lower,
+        upper = pr->partitionData[model]->upper;
+
+      parsimonyNumber 
+        **compressedTips = (parsimonyNumber **)rax_malloc(states * sizeof(parsimonyNumber*)),
+        *compressedValues = (parsimonyNumber *)rax_malloc(states * sizeof(parsimonyNumber));
+      
+      for(i = lower; i < upper; i++)    
+        if(informative[i])
+          entries += (size_t)tr->aliaswgt[i];     
+  
+      compressedEntries = entries / PLL_PCF;
+
+      if(entries % PLL_PCF != 0)
+        compressedEntries++;
+
+#if (defined(__SSE3) || defined(__AVX))
+      if(compressedEntries % INTS_PER_VECTOR != 0)
+        compressedEntriesPadded = compressedEntries + (INTS_PER_VECTOR - (compressedEntries % INTS_PER_VECTOR));
+      else
+        compressedEntriesPadded = compressedEntries;
+#else
+      compressedEntriesPadded = compressedEntries;
+#endif     
+
+      
+      rax_posix_memalign ((void **) &(pr->partitionData[model]->parsVect), PLL_BYTE_ALIGNMENT, (size_t)compressedEntriesPadded * states * totalNodes * sizeof(parsimonyNumber));
+     
+      for(i = 0; i < compressedEntriesPadded * states * totalNodes; i++)      
+        pr->partitionData[model]->parsVect[i] = 0;
+
+      for(i = 0; i < (size_t)tr->mxtips; i++)
+        {
+          size_t
+            w = 0,
+            compressedIndex = 0,
+            compressedCounter = 0,
+            index = 0;
+
+          for(k = 0; k < states; k++)
+            {
+              compressedTips[k] = &(pr->partitionData[model]->parsVect[(compressedEntriesPadded * states * (i + 1)) + (compressedEntriesPadded * k)]);
+              compressedValues[k] = 0;
+            }                
+              
+          for(index = lower; index < (size_t)upper; index++)
+            {
+              if(informative[index])
+                {
+                  const unsigned int 
+                    *bitValue = getBitVector(pr->partitionData[model]->dataType);
+
+                  parsimonyNumber 
+                    value = bitValue[tr->yVector[i + 1][index]];          
+              
+                  for(w = 0; w < (size_t)tr->aliaswgt[index]; w++)
+                    {      
+                      for(k = 0; k < states; k++)
+                        {
+                          if(value & mask32[k])
+                            compressedValues[k] |= mask32[compressedCounter];
+                        }
+                     
+                      compressedCounter++;
+                  
+                      if(compressedCounter == PLL_PCF)
+                        {
+                          for(k = 0; k < states; k++)
+                            {
+                              compressedTips[k][compressedIndex] = compressedValues[k];
+                              compressedValues[k] = 0;
+                            }                    
+                          
+                          compressedCounter = 0;
+                          compressedIndex++;
+                        }
+                    }
+                }
+            }
+                           
+          for(;compressedIndex < compressedEntriesPadded; compressedIndex++)
+            {   
+              for(;compressedCounter < PLL_PCF; compressedCounter++)              
+                for(k = 0; k < states; k++)
+                  compressedValues[k] |= mask32[compressedCounter];               
+          
+              for(k = 0; k < states; k++)
+                {
+                  compressedTips[k][compressedIndex] = compressedValues[k];
+                  compressedValues[k] = 0;
+                }                     
+              
+              compressedCounter = 0;
+            }           
+        }               
+  
+      pr->partitionData[model]->parsimonyLength = compressedEntriesPadded;
+
+      rax_free(compressedTips);
+      rax_free(compressedValues);
+    }
+  
+  rax_posix_memalign ((void **) &(tr->parsimonyScore), PLL_BYTE_ALIGNMENT, sizeof(unsigned int) * totalNodes);  
+          
+  for(i = 0; i < totalNodes; i++) 
+    tr->parsimonyScore[i] = 0;
+}
+
+
+
+static void stepwiseAddition(pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{            
+  nodeptr 
+    r = q->back;
+
+  unsigned int 
+    mp;
+  
+  int 
+    counter = 4;
+  
+  p->next->back = q;
+  q->back = p->next;
+
+  p->next->next->back = r;
+  r->back = p->next->next;
+   
+  computeTraversalInfoParsimony(p, tr->ti, &counter, tr->mxtips, PLL_FALSE);              
+  tr->ti[0] = counter;
+  tr->ti[1] = p->number;
+  tr->ti[2] = p->back->number;
+    
+  mp = evaluateParsimonyIterativeFast(tr, pr);
+  
+  if(mp < tr->bestParsimony)
+    {    
+      tr->bestParsimony = mp;
+      tr->insertNode = q;     
+    }
+ 
+  q->back = r;
+  r->back = q;
+   
+  if(q->number > tr->mxtips && tr->parsimonyScore[q->number] > 0)
+    {         
+      stepwiseAddition(tr, pr, p, q->next->back);
+      stepwiseAddition(tr, pr, p, q->next->next->back);
+    }
+}
+
+
+
+void allocateParsimonyDataStructures(pllInstance *tr, partitionList *pr)
+{
+  int 
+    i,
+    *informative = (int *)rax_malloc(sizeof(int) * (size_t)tr->originalCrunchedLength);
+ 
+  determineUninformativeSites(tr, pr, informative);
+
+  compressDNA(tr, pr, informative);
+
+  for(i = tr->mxtips + 1; i <= tr->mxtips + tr->mxtips - 1; i++)
+    {
+      nodeptr 
+        p = tr->nodep[i];
+
+      p->xPars = 1;
+      p->next->xPars = 0;
+      p->next->next->xPars = 0;
+    }
+
+  tr->ti = (int*)rax_malloc(sizeof(int) * 4 * (size_t)tr->mxtips);  
+
+  rax_free(informative); 
+}
+
+void pllFreeParsimonyDataStructures(pllInstance *tr, partitionList *pr)
+{
+  size_t 
+    model;
+
+  rax_free(tr->parsimonyScore);
+  
+  for(model = 0; model < (size_t) pr->numberOfPartitions; ++model)
+    rax_free(pr->partitionData[model]->parsVect);
+  
+  rax_free(tr->ti);
+}
+
+
+void pllMakeParsimonyTreeFast(pllInstance *tr, partitionList *pr, int sprDist)
+{   
+  nodeptr  
+    p, 
+    f;    
+
+  int 
+    i, 
+    nextsp,
+    *perm        = (int *)rax_malloc((size_t)(tr->mxtips + 1) * sizeof(int));  
+
+  unsigned int 
+    randomMP, 
+    startMP;         
+  
+  assert(!tr->constrained);
+
+  makePermutationFast(perm, tr->mxtips, tr);
+  
+  tr->ntips = 0;    
+  
+  tr->nextnode = tr->mxtips + 1;       
+  
+  buildSimpleTree(tr, pr, perm[1], perm[2], perm[3]);
+  
+  f = tr->start;       
+  
+  while(tr->ntips < tr->mxtips) 
+    {   
+      nodeptr q;
+      
+      tr->bestParsimony = INT_MAX;
+      nextsp = ++(tr->ntips);             
+      p = tr->nodep[perm[nextsp]];                 
+      q = tr->nodep[(tr->nextnode)++];
+      p->back = q;
+      q->back = p;
+        
+      if(tr->grouped)
+        {
+          int 
+            number = p->back->number;            
+
+          tr->constraintVector[number] = -9;
+        }
+          
+      stepwiseAddition(tr, pr, q, f->back);
+      
+      {
+        nodeptr   
+          r = tr->insertNode->back;
+        
+        int counter = 4;
+        
+        hookupDefault(q->next,       tr->insertNode);
+        hookupDefault(q->next->next, r);
+        
+        computeTraversalInfoParsimony(q, tr->ti, &counter, tr->mxtips, PLL_FALSE);              
+        tr->ti[0] = counter;
+        
+        newviewParsimonyIterativeFast(tr, pr);
+      }
+    }    
+  
+  nodeRectifierPars(tr);
+  
+  randomMP = tr->bestParsimony;        
+  
+  do
+    {
+      startMP = randomMP;
+      nodeRectifierPars(tr);
+      for(i = 1; i <= tr->mxtips + tr->mxtips - 2; i++)
+        {
+          rearrangeParsimony(tr, pr, tr->nodep[i], 1, sprDist, PLL_FALSE);
+          if(tr->bestParsimony < randomMP)
+            {           
+              restoreTreeRearrangeParsimony(tr, pr);
+              randomMP = tr->bestParsimony;
+            }
+        }                          
+    }
+  while(randomMP < startMP);
+  
+  rax_free(perm);
+} 
diff --git a/pll/genericParallelization.c b/pll/genericParallelization.c
new file mode 100644
index 0000000..1454b5e
--- /dev/null
+++ b/pll/genericParallelization.c
@@ -0,0 +1,2283 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file genericParallelization.c
+ */
+#include "mem_alloc.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <limits.h>
+
+#ifdef MEASURE_TIME_PARALLEL
+#include <time.h>
+#endif
+
+#include <assert.h>
+
+#include "genericParallelization.h"
+#include "pllInternal.h"
+#include "pll.h"
+
+/** @file genericParallelization.c
+    
+    @brief Generic master-worker parallelization with either pthreads or MPI. 
+    
+    Worker threads/processes mostly work on a local
+    tree. Implementation-wise, MPI operations are abstracted as far as
+    possible via defines (that translate to no-ops or memcpy-calls in
+    the pthreads version).
+
+    @todo the code still contains many memory copy operations that
+    could be executed more efficiently in-place  
+*/
+
+
+
+void perSiteLogLikelihoodsPthreads(pllInstance *tr, partitionList *pr, double *lhs, int n, int tid);
+void broadcastAfterRateOpt(pllInstance *tr, pllInstance *localTree, partitionList *pr, int n, int tid);
+void branchLength_parallelReduce(pllInstance *tr, double *dlnLdlz,  double *d2lnLdlz2, int numBranches );
+void pllMasterPostBarrier(pllInstance *tr, partitionList *pr, int jobType);
+static void distributeYVectors(pllInstance *localTree, pllInstance *tr, partitionList *localPr);
+static void distributeWeights(pllInstance *localTree, pllInstance *tr, partitionList *localPr);
+static pllBoolean execFunction(pllInstance *tr, pllInstance *localTree, partitionList *pr, partitionList *localPr, int tid, int n);
+
+static void *likelihoodThread(void *tData); 
+
+static void multiprocessorScheduling(pllInstance * tr, partitionList *pr, int tid);
+
+static void computeFraction(partitionList *localPr, int tid, int n);
+static void computeFractionMany(partitionList *localPr, int tid);
+static void initializePartitionsMaster(pllInstance *tr, pllInstance *localTree, partitionList *pr, partitionList *localPr, int tid, int n);
+
+#ifdef _FINE_GRAIN_MPI
+static char* addBytes(char *buf, void *toAdd, size_t numBytes); 
+static char* popBytes(char *buf, void *result, size_t numBytes); 
+static void defineTraversalInfoMPI(void);
+static pllBoolean pllWorkerTrap(pllInstance *tr, partitionList *pr);
+#endif
+
+#ifdef _USE_PTHREADS
+static pthread_t *threads;
+static threadData *tData;
+#endif
+
+extern volatile int jobCycle; 
+extern volatile int threadJob;          /**< current job to be done by worker threads/processes */
+extern pllBoolean treeIsInitialized; 
+
+#ifdef MEASURE_TIME_PARALLEL
+extern double masterTimePerPhase; 
+double timeBuffer[NUM_PAR_JOBS]; 
+double timePerRegion[NUM_PAR_JOBS]; 
+#endif
+
+extern char* getJobName(int tmp); 
+
+//extern double *globalResult; 
+extern volatile char *barrierBuffer;
+
+
+#ifdef _FINE_GRAIN_MPI
+extern MPI_Datatype TRAVERSAL_MPI; 
+
+/** @brief MPI helper function for adding bytes to the communication buffer.
+
+    Copy \a numBytes bytes from \a toAdd to \a buf
+
+    @param buf
+      Where to place the bytes
+
+    @param toAdd
+      Where to copy them from
+
+    @param numBytes
+      How many to copy
+
+    @return
+      Pointer to the end of placed data in communication buffer (first free slot)
+ */ 
+static char* addBytes(char *buf, void *toAdd, size_t numBytes)
+{
+  memcpy(buf, toAdd, numBytes);  
+  return buf + numBytes;  
+}
+
+/** @brief MPI helper function for removing bytes from the communication buffer
+    
+    Copies \a numBytes bytes from the communication buffer \a buf to the local buffer \a result
+
+    @param buf
+      Where to copy the bytes from
+
+    @param result
+      Where to store them
+
+    @param numBytes
+      How many to copy
+    
+    @return
+      Pointer to the end of read data in communication buffer (first free slot)
+ */ 
+static char* popBytes(char *buf, void *result, size_t numBytes)
+{
+  memcpy(result, buf, numBytes); 
+  return buf + numBytes;   
+}
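+
+/* Minimal pairing sketch for the two helpers above (the variable names are
+   made up for illustration only):
+
+     char buf[sizeof(int) + sizeof(double)], *p = buf;
+     int    a = 42;
+     double b = 0.5;
+
+     p = addBytes(p, &a, sizeof(int));          (pack in some order)
+     p = addBytes(p, &b, sizeof(double));
+
+     p = buf;
+     p = popBytes(p, &a, sizeof(int));          (unpack in the same order)
+     p = popBytes(p, &b, sizeof(double));
+*/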
+
+/** @brief Lock the MPI slave processes prior to allocating partitions
+
+    MPI slave processes are locked and wait until the master process
+    has read the number of partitions, which it then broadcasts
+    to slaves, effectively unlocking them. The slave processes will
+    then allocate their own data structures and be locked in the
+    likelihood function.
+
+    @param tr
+      PLL instance
+    
+    @todo
+      This function should not be called by the user. It is called
+      at \a pllCreateInstance. Probably this function should be removed
+      and inline code be placed in \a pllCreateInstance.
+*/
+void pllLockMPI (pllInstance * tr)
+{
+  int numberOfPartitions;
+  partitionList * pr;
+
+  if (!MASTER_P) 
+   {
+     //MPI_Bcast (&numberOfPartitions, 1, MPI_INT, MPI_ROOT, MPI_COMM_WORLD);
+     MPI_Bcast (&numberOfPartitions, 1, MPI_INT, 0, MPI_COMM_WORLD);
+     pr = (partitionList *) rax_calloc (1, sizeof (partitionList));
+     pr->numberOfPartitions = numberOfPartitions;
+
+     pllWorkerTrap (tr, pr);
+     MPI_Barrier (MPI_COMM_WORLD);
+     MPI_Finalize ();
+     exit(0);
+   }
+}
+
+/** Finalize MPI run
+
+    Finalizes MPI run by synchronizing all processes (master + slaves) with a
+    barrier so that all free their allocated resources. Then \a MPI_Finalize ()
+    is called.
+
+    @todo
+      Similarly as with the \a pllLockMPI function, this should not be called
+      by the user, but it is called implicitly at the end of \a pllDestroyInstance.
+      Probably this function should be removed and inline code be placed in
+      \a pllDestroyInstance.
+*/
+void pllFinalizeMPI (void)
+{
+  MPI_Barrier (MPI_COMM_WORLD);
+  MPI_Finalize ();
+}
+
+/**
+   @brief Sets up the MPI environment.  
+
+   Calls the \a MPI_Init function and makes sure all processes store
+   their process ID and the total number of processes, using a barrier.
+   
+   @note this should be the first call that is executed in your main
+   method.
+   
+   @param argc   
+     Address of argc from main
+   @param argv   
+     Address of argv from main
+ */
+void pllInitMPI(int * argc, char **argv[])
+{  
+  MPI_Init(argc, argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &processID);
+  MPI_Comm_size(MPI_COMM_WORLD, &processes);
+
+  /* if(MASTER_P) */
+  /*   printf("\nThis is RAxML Process Number: %d (MASTER)\n", processID); */
+  MPI_Barrier(MPI_COMM_WORLD);
+
+}
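+
+/* Minimal usage sketch (assuming the notes above and in pllLockMPI and
+   pllFinalizeMPI, both of which are invoked implicitly by the library):
+
+     int main(int argc, char *argv[])
+     {
+       pllInitMPI(&argc, &argv);     must be the very first call
+       ...                           pllCreateInstance traps the worker
+                                     processes, pllDestroyInstance
+                                     finalizes MPI at the end
+       return 0;
+     }
+*/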
+
+
+/**
+   @brief Traps worker MPI processes.    
+   
+   @note  This function should be called immediately after pllInitMPI()
+
+   @param tr 
+     PLL instance 
+
+   @param pr
+     List of partitions
+
+   @return
+     Returns \b PLL_FALSE if the caller is the master thread/process, otherwise \b PLL_TRUE
+ */ 
+static pllBoolean pllWorkerTrap(pllInstance *tr, partitionList *pr)
+{
+  /// @note for the broadcasting, we need to know whether the tree structure has already been initialized 
+  treeIsInitialized = PLL_FALSE; 
+
+  if(NOT MASTER_P) 
+    {
+      threadData tData; 
+      tData.tr = tr; 
+      tData.threadNumber = processID;
+      tData.pr = pr;
+      
+      likelihoodThread(&tData);
+
+      /* notice: the next call MUST be the return call from the main method */
+      return PLL_TRUE; 
+    }
+  return PLL_FALSE; 
+}
+
+
+#define ELEMS_IN_TRAV_INFO  9
+/** @brief Create a data structure for sending the traversal descriptor.
+    
+    @note This seems to be a very safe method to define your own MPI
+   datatypes (often there are problems with padding). But it is not
+   entirely for the faint of heart...
+ */ 
+static void defineTraversalInfoMPI (void)
+{
+  MPI_Datatype *result  = &TRAVERSAL_MPI; 
+
+  int i ; 
+  MPI_Aint base; 
+  int blocklen[ELEMS_IN_TRAV_INFO+1] = {1, 1, 1, 1, PLL_NUM_BRANCHES, PLL_NUM_BRANCHES, 1,1,1,1}; 
+  MPI_Aint disp[ELEMS_IN_TRAV_INFO+1];
+  MPI_Datatype type[ELEMS_IN_TRAV_INFO+1] = {MPI_INT, MPI_INT, MPI_INT, MPI_INT, MPI_DOUBLE, MPI_DOUBLE, MPI_INT, MPI_INT, MPI_INT, MPI_UB}; 
+  traversalInfo desc[2]; 
+
+  MPI_Get_address( desc, disp);
+  MPI_Get_address( &(desc[0].pNumber), disp + 1 );
+  MPI_Get_address( &(desc[0].qNumber), disp + 2 );  
+  MPI_Get_address( &(desc[0].rNumber), disp + 3); 
+  MPI_Get_address( desc[0].qz, disp + 4 );
+  MPI_Get_address( desc[0].rz, disp + 5 );
+  MPI_Get_address( &(desc[0].slot_p), disp + 6);
+  MPI_Get_address( &(desc[0].slot_q), disp + 7);
+  MPI_Get_address( &(desc[0].slot_r), disp + 8);
+  MPI_Get_address( desc + 1, disp + 9);
+
+  base = disp[0]; 
+  for(i = 0; i < ELEMS_IN_TRAV_INFO+1; ++i)
+    disp[i] -= base;
+
+  MPI_Type_create_struct( ELEMS_IN_TRAV_INFO+1 , blocklen, disp, type, result);
+  MPI_Type_commit(result);
+}
+
+
+#endif
+
+
+/********************/
+/* PTHREAD-SPECIFIC */
+/********************/
+#ifdef _USE_PTHREADS
+
+#ifndef _PORTABLE_PTHREADS
+/** @brief Pins a thread to a core (for efficiency). 
+
+    This is a non-portable function that works only with some Linux pthreads implementations.
+    It sets the affinity of each thread to a specific core so that performance is not
+    degraded due to thread migration.
+
+    @note 
+      It is only called if \a _PORTABLE_PTHREADS is not defined
+
+    @param tid the thread id
+ */ 
+void pinToCore(int tid)
+{
+  static int nextCore = 0;
+
+  cpu_set_t cpuset;
+
+  CPU_ZERO(&cpuset);    
+  CPU_SET(nextCore++, &cpuset);
+
+  if(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0)
+    {
+      assert(0);
+    }
+}
+#endif
+
+/**  Start PThreads
+
+     Start JOINABLE threads by executing \a pthread_create. The threads
+     are attached to the \a pllLikelihoodThread function
+
+     @param tr
+       PLL instance
+
+     @param pr
+       List of partitions
+
+     @todo
+       This function should never be called by the user. It is called
+       implicitly at \a pllInitModel. Perhaps we should add a check
+       or inline the code
+ */ 
+void pllStartPthreads (pllInstance *tr, partitionList *pr)
+{
+  pthread_attr_t attr;
+  int rc, t;
+  treeIsInitialized = PLL_FALSE; 
+
+  jobCycle        = 0;
+  threadJob       = 0;
+
+  /* printf("\nThis is the RAxML Master Pthread\n");   */
+
+#if (NOT defined(_USE_PTHREADS) && defined( MEASURE_TIME_PARALLEL))
+  timeBuffer = rax_calloc(NUM_PAR_JOBS * tr->numberOfThreads, sizeof(double)); 
+#endif
+
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+  threads    = (pthread_t *)rax_malloc((size_t)tr->numberOfThreads * sizeof(pthread_t));
+  tData      = (threadData *)rax_malloc((size_t)tr->numberOfThreads * sizeof(threadData));
+
+  barrierBuffer            = (volatile char *)  rax_malloc(sizeof(volatile char)   *  (size_t)tr->numberOfThreads);
+
+  for(t = 0; t < tr->numberOfThreads; t++)
+    barrierBuffer[t] = 0;
+
+  for(t = 1; t < tr->numberOfThreads; t++)
+    {
+      tData[t].tr  = tr;
+      tData[t].pr  = pr;
+      tData[t].threadNumber = t;
+      rc = pthread_create(&threads[t], &attr, likelihoodThread, (void *)(&tData[t]));
+      if(rc)
+	{
+	  printf("ERROR; return code from pthread_create() is %d\n", rc);
+	  exit(-1);
+	}
+    }
+  pthread_attr_destroy (&attr);
+}
+
+/** Stop PThread
+    
+    Stop threads by \a pthread_join
+
+    @param  tr
+      PLL instance
+
+    @todo
+      This function should never be called by the user. It is implicitly called
+      at \a pllPartitionsDestroy. We should inline the code
+*/
+void pllStopPthreads (pllInstance * tr)
+{
+  int i;
+
+  for (i = 1; i < tr->numberOfThreads; ++ i)
+   {
+     pthread_join (threads[i], NULL);
+   }
+ 
+  rax_free (threads);
+  rax_free (tData);
+  rax_free ((void *)barrierBuffer);
+  rax_free (globalResult);
+
+}
+#endif
+
+
+/** Compute per-site log likelihoods (PThreads version) 
+
+    Worker threads evaluate the likelihood on their sites
+
+    @param tr 
+      Tree instance
+
+    @param lhs
+      Likelihood array
+
+    @param n
+      Number of threads
+
+    @param tid
+      Thread id
+ */ 
+void perSiteLogLikelihoodsPthreads(pllInstance *tr, partitionList *pr, double *lhs, int n, int tid)
+{
+  size_t 
+    model, 
+    i;
+
+  for(model = 0; model < (size_t)pr->numberOfPartitions; model++)
+    {      
+      size_t 
+	localIndex = 0;
+
+      /* decide if this partition is handled by the thread when -Q is activated, 
+	 or, when -Q is not activated, figure out which sites have been assigned to the 
+	 current thread */
+
+      pllBoolean 
+	execute = ((tr->manyPartitions && isThisMyPartition(pr, tid, model)) || (!tr->manyPartitions));
+
+      /* if the entire partition has been assigned to this thread (-Q) or if -Q is not activated 
+	 we need to compute some per-site log likelihoods with thread tid for this partition */
+
+      if(execute)
+	for(i = (size_t)(pr->partitionData[model]->lower);  i < (size_t)(pr->partitionData[model]->upper); i++)
+	  {
+	    /* if -Q is active we compute all per-site log likelihoods for the partition,
+	       otherwise we only compute those that have been assigned to thread tid 
+	       using the cyclic distribution scheme */
+
+	    if(tr->manyPartitions || (i % n == (size_t)tid))
+	      {
+		double 
+		  l;
+
+		/* now compute the per-site log likelihood at the current site */
+
+		switch(tr->rateHetModel)
+		  {
+		  case PLL_CAT:
+		    l = evaluatePartialGeneric (tr, pr, localIndex, pr->partitionData[model]->perSiteRates[pr->partitionData[model]->rateCategory[localIndex]], model);
+		    break;
+		  case PLL_GAMMA:
+		    l = evaluatePartialGeneric (tr, pr, localIndex, 1.0, model);
+		    break;
+		  default:
+		    assert(0);
+		  }
+
+		/* store it in an array that is local in memory to the current thread,
+		   see function collectDouble() in axml.c for understanding how we then collect these 
+		   values stored in local arrays from the threads */
+
+		lhs[i] = l;
+
+		localIndex++;
+	      }
+	  }
+    }
+}
+
+/** @brief Check if a partition is assigned to a thread/process.
+
+    Checks whether partition \a model from partition list \a localPr is
+    assigned to be processed by process/thread with id \a tid.
+
+    @param localPr
+      Local list of partitions
+
+    @param tid 
+      Thread/Process id
+
+    @param model
+      Partition number
+ */ 
+pllBoolean isThisMyPartition(partitionList *localPr, int tid, int model)
+{ 
+  if(localPr->partitionData[model]->partitionAssignment == tid)
+    return PLL_TRUE;
+  else
+    return PLL_FALSE;
+}
+
+/** @brief Computes partition size for all partitions (in case full partitions are assigned to workers). 
+
+    @param localPr the local partitions instance
+    
+    @param tid thread id    
+ */ 
+static void computeFractionMany(partitionList *localPr, int tid)
+{
+  int
+    sites = 0;
+
+  int   
+    model;
+
+  for(model = 0; model < localPr->numberOfPartitions; model++)
+    {
+      if(isThisMyPartition(localPr, tid, model))
+	{	 
+    	  localPr->partitionData[model]->width = localPr->partitionData[model]->upper - localPr->partitionData[model]->lower;
+	  sites += localPr->partitionData[model]->width;
+	}
+      else       	  
+    	  localPr->partitionData[model]->width = 0;
+    }
+
+
+}
+
+
+/** @brief Computes partition size for all partitions (for cyclic distribution of sites)
+    
+    @param localPr the local partitions instance
+    @param tid thread id
+    @param n number of workers
+ */ 
+static void computeFraction(partitionList *localPr, int tid, int n)
+{
+  int
+    i,
+    model;
+
+  for(model = 0; model < localPr->numberOfPartitions; model++)
+    {
+      int width = 0;
+
+      for(i = localPr->partitionData[model]->lower; i < localPr->partitionData[model]->upper; i++)
+	if(i % n == tid)
+	  width++;
+      localPr->partitionData[model]->width = width;
+    }
+}
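+
+/* Quick illustration (sketch): with n = 4 workers, the worker with tid = 1
+   owns the sites whose global index i satisfies i % 4 == 1, so its width is
+   roughly (upper - lower) / 4. */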
+
+
+
+/** @brief Compare partition sizes. 
+    @param p1 pointer to a partition
+    @param p2 pointer to another partition
+ */ 
+static int partCompare(const void *p1, const void *p2)
+{
+  partitionType 
+    *rc1 = (partitionType *)p1,
+    *rc2 = (partitionType *)p2;
+
+  int 
+    i = rc1->partitionLength,
+    j = rc2->partitionLength;
+
+  if (i > j)
+    return (-1);
+  if (i < j)
+    return (1);
+  return (0);
+}
+
+
+/** @brief Top-level function for the multi processor scheduling
+    scheme (assigns full partitions to workers).
+    
+   tr->manyPartitions is set to PLL_TRUE if the user has indicated via -Q
+   that there are substantially more partitions than threads/cores
+   available. In that case we do not distribute sites from each
+   partition in a cyclic fashion to the cores, but distribute entire
+   partitions to cores.  Achieving a good balance of alignment sites
+   to cores boils down to the multi-processor scheduling problem known
+   from theoretical comp. sci.  which is NP-complete.  We have
+   implemented very simple "standard" heuristics for solving the
+   multiprocessor scheduling problem that turn out to work very well
+   and are cheap to compute.
+   
+   @param pr 
+     List of partitions
+
+   @param tid
+     Id of current process/thread 
+*/
+static void multiprocessorScheduling(pllInstance * tr, partitionList *pr, int tid)
+{
+  int 
+    s,
+    model,
+    modelStates[2] = {4, 20},
+    numberOfPartitions[2] = {0 , 0},
+      arrayLength = sizeof(modelStates) / sizeof(int);
+
+      /* check that we have not added any new models for data types with a different number of states
+	 and forgot to update modelStates */
+
+      for(model = 0; model < pr->numberOfPartitions; model++)
+	{        
+	  pllBoolean 
+	    exists = PLL_FALSE;
+
+	  for(s = 0; s < arrayLength; s++)
+	    {
+	      exists = exists || (pr->partitionData[model]->states == modelStates[s]);
+	      if(pr->partitionData[model]->states == modelStates[s])
+		numberOfPartitions[s] += 1;
+	    }
+
+	  assert(exists);
+	}
+
+      for(s = 0; s < arrayLength; s++)
+	{
+	  if(numberOfPartitions[s] > 0)
+	    {
+	      size_t   
+		checkSum = 0,
+		sum = 0;
+
+	      int    
+		i,
+		k,
+#ifndef _FINE_GRAIN_MPI
+		n = tr->numberOfThreads,
+#else
+		n = processes,
+#endif
+		p = numberOfPartitions[s],    
+		*assignments = (int *)rax_calloc((size_t)n, sizeof(int));  
+
+	      partitionType 
+		*pt = (partitionType *)rax_malloc(sizeof(partitionType) * (size_t)p);
+
+
+
+	      for(i = 0, k = 0; i < pr->numberOfPartitions; i++)
+		{
+		  if(pr->partitionData[i]->states == modelStates[s])
+		    {
+		      pt[k].partitionNumber = i;
+		      pt[k].partitionLength = pr->partitionData[i]->upper - pr->partitionData[i]->lower;
+		      sum += (size_t)pt[k].partitionLength;
+		      k++;
+		    }
+		}
+
+	      assert(k == p);
+
+	      qsort(pt, p, sizeof(partitionType), partCompare);    
+
+	      for(i = 0; i < p; i++)
+		{
+		  int 
+		    k, 
+		    min = INT_MAX,
+		    minIndex = -1;
+
+		  for(k = 0; k < n; k++)	
+		    if(assignments[k] < min)
+		      {
+			min = assignments[k];
+			minIndex = k;
+		      }
+
+		  assert(minIndex >= 0);
+
+		  assignments[minIndex] +=  pt[i].partitionLength;
+		  assert(pt[i].partitionNumber >= 0 && pt[i].partitionNumber < pr->numberOfPartitions);
+		  pr->partitionData[pt[i].partitionNumber]->partitionAssignment = minIndex;
+		}
+
+              
+              /* Process i gets assignments[i] sites for modelStates[s] state model */
+
+	      for(i = 0; i < n; i++)
+		checkSum += (size_t)assignments[i];
+
+	      assert(sum == checkSum);
+
+	      rax_free(assignments);
+	      rax_free(pt);
+	    }
+	}
+}
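+
+/* Worked example of the heuristic above (made-up numbers): four partitions of
+   length 10, 8, 7 and 3 distributed over n = 2 workers.  After sorting in
+   descending order each partition goes to the currently least loaded worker:
+   10 -> w0, 8 -> w1, 7 -> w1 (load 15), 3 -> w0 (load 13), giving the final
+   assignment {13, 15}. */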
+
+
+
+/** @brief Reduce the first and second derivative of the likelihood
+    function.
+    
+    We collect the first and second derivatives from the various
+    threads and sum them up. It's similar to what we do in
+    pllEvaluateGeneric() with the only difference that we have to collect
+    two values (first and second derivative) instead of only one (the
+    log likelihood).
+
+   @warning operates on the global reduction buffer \a globalResult
+   
+   @param tr tree 
+   @param dlnLdlz first derivative
+   @param d2lnLdlz2 second derivative
+*/
+void branchLength_parallelReduce(pllInstance *tr, double *dlnLdlz,  double *d2lnLdlz2, int numBranches )
+{
+#ifdef _REPRODUCIBLE_MPI_OR_PTHREADS
+
+  /* only the master executes this  */
+  assert(tr->threadID == 0); 
+  
+  int b; 
+  int t; 
+  for(b = 0; b < numBranches; ++b)
+    {
+      dlnLdlz[b] = 0; 
+      d2lnLdlz2[b] = 0; 
+
+      for(t = 0; t < tr->numberOfThreads; ++t)
+	{
+	  dlnLdlz[b] += globalResult[t * numBranches * 2 + b ];
+	  d2lnLdlz2[b] += globalResult[t * numBranches * 2 + numBranches + b];
+	}
+    }
+#else 
+  memcpy(dlnLdlz, globalResult, sizeof(double) * numBranches);
+  memcpy(d2lnLdlz2, globalResult + numBranches, sizeof(double) * numBranches);
+#endif
+}
+
+
+
+/** @brief Reads from a buffer or writes rates into a buffer.  Returns the
+    number of elements written.
+
+    If \a read is set to \b PLL_TRUE, then the contents of \a srcTar are
+    copied to \a buf. Otherwise, the contents of \a buf are moved to
+    \a srcTar.
+   
+   @param buf 
+     Buffer
+
+   @param srcTar 
+     Pointer to either source or destination array
+
+   @param tr
+     PLL instance
+
+   @param n number of workers
+
+   @param tid process id
+
+   @param read 
+     If read-mode then set to \b PLL_TRUE
+
+   @param countOnly
+     if \b PLL_TRUE, simply return the number of elements
+*/
+static int doublesToBuffer(double *buf, double *srcTar, pllInstance *tr, partitionList *pr, int n, int tid, pllBoolean read, pllBoolean countOnly)
+{
+  int 
+    model,
+    i;
+  double 
+    *initPtr = buf; 
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      if(tr->manyPartitions)
+	{
+	  if(isThisMyPartition(pr, tid, model))
+	    for(i = pr->partitionData[model]->lower; i < pr->partitionData[model]->upper; i++)
+	      {
+		if(NOT countOnly)
+		  {
+		    if(read)
+		      *buf = srcTar[i]; 
+		    else 
+		      srcTar[i] = *buf; 
+		  }
+		buf++;
+	      }	  
+	}      
+      else
+	{
+	  for(i = pr->partitionData[model]->lower; i < pr->partitionData[model]->upper; i++)
+	    if(i % n == tid)
+	      {
+		if(NOT countOnly)
+		  {
+		    if(read)
+		      *buf = srcTar[i];
+		    else 
+		      srcTar[i] = *buf; 
+		  }
+		buf++; 
+	      }
+	}
+    }
+  
+  return buf - initPtr; 
+}
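+
+/* Illustrative call pattern (a sketch; nWorkers and buf are made-up names):
+   first count the elements this worker owns, then pack them into buf, and
+   later write them back into the full-length array:
+
+     int num = doublesToBuffer(buf, tr->patrat, tr, pr, nWorkers, tid,
+                               PLL_TRUE,  PLL_TRUE);     (count only)
+     doublesToBuffer(buf, tr->patrat, tr, pr, nWorkers, tid,
+                     PLL_TRUE,  PLL_FALSE);              (read into buf)
+     doublesToBuffer(buf, tr->patrat, tr, pr, nWorkers, tid,
+                     PLL_FALSE, PLL_FALSE);              (write back from buf)
+*/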
+
+
+
+
+/** @brief broadcast rates after rate optimization. 
+    
+    @param tr Library instance
+    @param localTree local library instance 
+    @param n number of workers 
+    @param tid worker id 
+    
+    @todo mpi_alltoallv/w may be more efficient, but it is a hell to set up
+ */ 
+void broadcastAfterRateOpt(pllInstance *tr, pllInstance *localTree, partitionList *pr, int n, int tid)
+{				  
+  int
+    num1 = 0,
+    num2 = 0,
+    num3 = 0, 
+    i ; 
+    
+  for(i = 0; i < n; ++i)
+    {
+      double
+	allBuf[tr->originalCrunchedLength * 3],
+	buf1[tr->originalCrunchedLength],
+	buf2[tr->originalCrunchedLength], 
+	buf3[tr->originalCrunchedLength]; 
+
+#ifdef _USE_PTHREADS
+      if(i != tid)
+	continue; 
+#endif
+      int numDouble = 0; 
+      
+      /* extract doubles  */
+
+      num1 = doublesToBuffer(buf1, localTree->patrat, tr, pr, n,i, PLL_TRUE, i!= tid);
+      num2 = doublesToBuffer(buf2, localTree->patratStored, tr, pr, n,i, PLL_TRUE, i!= tid);
+      num3 = doublesToBuffer(buf3, localTree->lhs, tr, pr, n,i, PLL_TRUE, i!= tid);
+
+      /* printf("%d + %d + %d\n", num1, num2, num3);  */
+
+      numDouble += num1 + num2 + num3; 
+
+      /* copy doubles  */
+      
+      memcpy(allBuf, buf1, num1 * sizeof(double)); 
+      memcpy(allBuf + num1, buf2, num2 * sizeof(double)); 
+      memcpy(allBuf + (num1 + num2) , buf3, num3 * sizeof(double)); 
+
+      BCAST_BUF(allBuf, numDouble, MPI_DOUBLE, i); 
+
+      memcpy(buf1, allBuf, num1 * sizeof(double)); 
+      memcpy(buf2, allBuf + num1, num2 * sizeof(double)); 
+      memcpy(buf3, allBuf + (num1 + num2), num3 * sizeof(double)); 
+      
+      /* re-insert doubles  */
+      int assertCtr = 0; 
+      assertCtr += doublesToBuffer(buf1, tr->patrat, tr, pr, n,i,PLL_FALSE, PLL_FALSE);
+      assertCtr += doublesToBuffer(buf2, tr->patratStored, tr, pr, n,i,PLL_FALSE, PLL_FALSE);
+      assertCtr += doublesToBuffer(buf3, tr->lhs, tr, pr, n,i,PLL_FALSE, PLL_FALSE);
+
+      assert(assertCtr == numDouble); 
+    }
+}
+
+
+/** @brief Collect doubles from workers to master.
+ 
+    
+
+    @param dst destination array
+    @param src source array
+    @param tr library instance 
+    @param n number of workers 
+    @param tid worker id 
+ */
+static void collectDouble(double *dst, double *src, pllInstance *tr, partitionList *pr, int n, int tid)
+{
+#ifdef _FINE_GRAIN_MPI    
+  int
+    assertNum = 0,
+    i, 
+    displacements[tr->numberOfThreads];
+  double 
+    buf[tr->originalCrunchedLength],
+    resultBuf[tr->originalCrunchedLength]; 
+
+  /* NOTE: This was moved here because it was an additional unnecessary move for the PTHREADS version. I didn't
+     have time to check the MPI version; have to get back to this and remove it */
+  /* gather own persite log likelihood values into local buffer  */
+  int numberCollected = doublesToBuffer(buf, src, tr, pr,n,tid,PLL_TRUE, PLL_FALSE);
+
+  /* this communicates all the values to the master */
+  
+  int numberPerWorker[tr->numberOfThreads];     
+  if(MASTER_P)			/* master counts number to receive, receives and writes back */
+    {
+      for(i = 0; i < n; ++i)
+	{
+	  numberPerWorker[i] = doublesToBuffer(buf,src,tr,pr,n,i,PLL_FALSE, PLL_TRUE);
+	  displacements[i] = i == 0 ? 0 : displacements[i-1] + numberPerWorker[i-1]; 
+	}
+      
+      MPI_Gatherv(buf, numberCollected, MPI_DOUBLE,
+		  resultBuf, numberPerWorker, displacements,  MPI_DOUBLE,
+		  0, MPI_COMM_WORLD); 
+
+      double *bufPtr = resultBuf; 
+      for(i = 0 ; i < n; ++i)
+	{
+	  int numberWritten = doublesToBuffer(bufPtr, dst,tr,pr,n,i, PLL_FALSE, PLL_FALSE);
+	  bufPtr += numberWritten; 
+	  assertNum += numberWritten; 
+	}    
+      
+      assert(assertNum == tr->originalCrunchedLength);
+    }
+  else 				/* workers only send their buffer   */
+    MPI_Gatherv(buf, numberCollected, MPI_DOUBLE, resultBuf, numberPerWorker, displacements, MPI_DOUBLE, 0, MPI_COMM_WORLD);   
+#else 
+  /* pthread version only writes to global space  */  
+
+  //assertNum = doublesToBuffer(buf, dst,tr,pr,n,tid, PLL_FALSE, PLL_FALSE);
+  doublesToBuffer (dst, src, tr, pr, n, tid, PLL_TRUE, PLL_FALSE);
+  //assert(assertNum == numberCollected); 
+#endif
+}
+
+
+
+/** @brief broadcast a new alpha (for the GAMMA model)
+    @param localPr local list of partitions
+    @param pr list of partitions
+ */
+static void broadCastAlpha(partitionList *localPr, partitionList *pr)
+{
+  int  i, 
+    model; 
+
+#ifdef _FINE_GRAIN_MPI
+    int bufSize = localPr->numberOfPartitions * 4 * sizeof(double);
+  char bufDbl[bufSize]; 
+  char *bufPtrDbl = bufDbl;   
+#endif
+
+  RECV_BUF(bufDbl, bufSize, MPI_BYTE); 
+
+  for(model = 0; model < localPr->numberOfPartitions; model++)
+    for(i = 0; i < 4; ++i)
+      ASSIGN_BUF_DBL(localPr->partitionData[model]->gammaRates[i], pr->partitionData[model]->gammaRates[i]);
+  
+  SEND_BUF(bufDbl, bufSize, MPI_BYTE);  
+}
+
+/** @brief broadcast new LG4X weights
+    @param localPr local list of partitions
+    @param pr list of partitions
+ */
+static void broadCastLg4xWeights(partitionList *localPr, partitionList *pr)
+{
+  int  i,
+    model;
+
+#ifdef _FINE_GRAIN_MPI
+    int bufSize = localPr->numberOfPartitions * 4 * sizeof(double);
+  char bufDbl[bufSize];
+  char *bufPtrDbl = bufDbl;
+#endif
+
+  RECV_BUF(bufDbl, bufSize, MPI_BYTE);
+
+  for(model = 0; model < localPr->numberOfPartitions; model++)
+    for(i = 0; i < 4; ++i)
+      ASSIGN_BUF_DBL(localPr->partitionData[model]->lg4x_weights[i], pr->partitionData[model]->lg4x_weights[i]);
+
+  SEND_BUF(bufDbl, bufSize, MPI_BYTE);
+}
+
+static void copyLG4(partitionList *localPr, partitionList *pr)
+{
+    int model, i, k;
+
+    /* determine size of buffer needed first */
+    int bufSize = 0;
+
+#ifdef _FINE_GRAIN_MPI
+    for(model = 0; model < localPr->numberOfPartitions; ++model )
+      {
+        const partitionLengths *pl = getPartitionLengths(pr->partitionData[model]);
+        bufSize += 4*(pl->eignLength + pl->evLength + pl->eiLength + pl->tipVectorLength + pl->substRatesLength + pl->frequenciesLength) * sizeof(double) ;
+      }
+#endif
+
+    char
+      bufDbl[bufSize];
+    char *bufPtrDbl = bufDbl;
+
+    RECV_BUF(bufDbl, bufSize, MPI_BYTE);
+
+    for (model = 0; model < localPr->numberOfPartitions; model++)
+    {
+        pInfo * localInfo = localPr->partitionData[model];
+        pInfo * info = pr->partitionData[model];
+
+        if (info->protModels == PLL_LG4M || info->protModels == PLL_LG4X)
+        {
+            for (k = 0; k < 4; k++)
+            {
+                const partitionLengths *pl = getPartitionLengths(pr->partitionData[model]);
+
+                for (i = 0; i < pl->eignLength; ++i)
+                    ASSIGN_BUF_DBL(
+                            localPr->partitionData[model]->EIGN_LG4[k][i],
+                            pr->partitionData[model]->EIGN_LG4[k][i]);
+                for (i = 0; i < pl->evLength; ++i)
+                    ASSIGN_BUF_DBL(localPr->partitionData[model]->EV_LG4[k][i],
+                            pr->partitionData[model]->EV_LG4[k][i]);
+                for (i = 0; i < pl->eiLength; ++i)
+                    ASSIGN_BUF_DBL(localPr->partitionData[model]->EI_LG4[k][i],
+                            pr->partitionData[model]->EI_LG4[k][i]);
+                for (i = 0; i < pl->substRatesLength; ++i)
+                    ASSIGN_BUF_DBL(
+                            localPr->partitionData[model]->substRates_LG4[k][i],
+                            pr->partitionData[model]->substRates_LG4[k][i]);
+                for (i = 0; i < pl->frequenciesLength; ++i)
+                    ASSIGN_BUF_DBL(
+                            localPr->partitionData[model]->frequencies_LG4[k][i],
+                            pr->partitionData[model]->frequencies_LG4[k][i]);
+                for (i = 0; i < pl->tipVectorLength; ++i)
+                    ASSIGN_BUF_DBL(
+                            localPr->partitionData[model]->tipVector_LG4[k][i],
+                            pr->partitionData[model]->tipVector_LG4[k][i]);
+            }
+        }
+    }
+    SEND_BUF(bufDbl, bufSize, MPI_BYTE); /*  */
+}
+
+/** @brief Master broadcasts rates.
+    
+    @param localPr local list of partitions
+    @param pr list of partitions
+ */ 
+static void broadCastRates(partitionList *localPr, partitionList *pr)
+{
+  int 
+    model;
+
+  /* determine size of buffer needed first */
+  int bufSize = 0;
+#ifdef _FINE_GRAIN_MPI
+  for(model = 0; model < localPr->numberOfPartitions; ++model )
+    {	  
+      const partitionLengths *pl = getPartitionLengths(pr->partitionData[model]); /* this is constant, isnt it?  */
+      bufSize += (pl->eignLength + pl->evLength + pl->eiLength + pl->tipVectorLength) * sizeof(double) ;
+    }
+#endif
+
+  char
+      bufDbl[bufSize];
+    char *bufPtrDbl = bufDbl;
+
+  RECV_BUF(bufDbl, bufSize, MPI_BYTE);
+  int i ; 
+
+  for(model = 0; model < localPr->numberOfPartitions; model++)
+    {
+      const partitionLengths *pl = getPartitionLengths(pr->partitionData[model]); /* this is constant, isnt it?  */
+
+      for(i = 0; i < pl->eignLength; ++i)
+	ASSIGN_BUF_DBL(localPr->partitionData[model]->EIGN[i], pr->partitionData[model]->EIGN[i]);
+      for(i = 0; i < pl->evLength; ++i)
+	ASSIGN_BUF_DBL(localPr->partitionData[model]->EV[i],pr->partitionData[model]->EV[i]);
+      for(i = 0; i  < pl->eiLength; ++i)
+	ASSIGN_BUF_DBL(localPr->partitionData[model]->EI[i], pr->partitionData[model]->EI[i]);
+      for(i = 0; i < pl->tipVectorLength; ++i)
+	ASSIGN_BUF_DBL(localPr->partitionData[model]->tipVector[i],   pr->partitionData[model]->tipVector[i]);
+    }
+  SEND_BUF(bufDbl, bufSize, MPI_BYTE); /*  */
+
+  copyLG4(localPr, pr);
+}
+
+/** @brief Evaluate the likelihood of this topology (PThreads/MPI implementation)
+
+    Evaluate the likelihood of the topology described in the PLL instance. First
+    every thread calls \a pllEvaluateIterative where it computes the log likelihoods
+    for the  portion of each assigned partition. The results (for all partition) are stored
+    as elements of a local buffer array (\a buf). This is done by all threads. Subsequently, 
+    an \a MPI_Reduce operation sums the contents of corresponding elements of the local
+    buffer arrays into another array (\a targetBuf) which are the log likelihoods of
+    each (complete) partition. Finally, the last array is copied to the master thread/process.
+    In addition, if \a getPerSiteLikelihoods is enabled the log likelihoods for each site
+    in the (compressed) alignment are stored in the array \a tr->lhs.
+
+    @param tr
+      PLL instance
+    @param localTree
+      Local (thread/process) PLL instance
+
+    @param pr
+      Local (thread/process) list of partitions
+
+    @param tid
+      Thread/Process ID
+
+    @param getPerSiteLikelihoods 
+      If set to \b PLL_TRUE, compute the log likelihood for each site. 
+ */ 
+static void reduceEvaluateIterative(pllInstance *tr, pllInstance *localTree, partitionList *localPr, int tid, pllBoolean getPerSiteLikelihoods)
+{
+  int model;
+
+  pllEvaluateIterative(localTree, localPr, getPerSiteLikelihoods);
+
+  /* when this is done we need to write the per-thread log likelihood to the 
+     global reduction buffer. Tid is the thread ID, hence thread 0 will write its 
+     results to reductionBuffer[0] thread 1 to reductionBuffer[1] etc.
+
+     the actual sum over the entries in the reduction buffer will then be computed 
+     by the master thread which ensures that the sum is deterministic */
+
+  
+  /* if (getPerSiteLikelihoods == PLL_TRUE) store per-site likelihoods in array tr->lhs */
+  if(getPerSiteLikelihoods)
+    {    
+#ifdef _FINE_GRAIN_MPI
+      int n = processes; 
+#else 
+      int n = tr->numberOfThreads; 
+#endif
+
+      /* rearrange per site likelihoods into single local array for gathering */
+      int i ; 
+      for(model = 0; model < localPr->numberOfPartitions; ++model)
+	{
+	  pInfo *partition = localPr->partitionData[model]; 
+	  pllBoolean isMyPartition  = isThisMyPartition(localPr, tid, model);
+
+	  int ctr = 0; 
+	  for(i = partition->lower; i < partition->upper; ++i)
+	    {
+	      if(tr->manyPartitions && isMyPartition)
+		localTree->lhs[i] = partition->perSiteLikelihoods[ ctr++]; 
+	      else if(NOT tr->manyPartitions && (i % n) == tid)
+		localTree->lhs[i] = partition->perSiteLikelihoods[ctr++];
+	    }
+	}
+      
+      /* gather all the double into the global array */
+      collectDouble(tr->lhs, localTree->lhs, localTree, localPr,  n, tid); 
+    }
+
+  /* printf("collecting done\n" ); */
+#ifdef _REPRODUCIBLE_MPI_OR_PTHREADS
+  /* 
+     aberer: I implemented this as an mpi_gather operation into this buffer, 
+     pthreads version emulates this gather; 
+     master takes care of the reduction; 
+  */
+
+  double 
+    buf[localPr->numberOfPartitions];
+
+  for(model = 0; model < localPr->numberOfPartitions; ++model)
+    buf[model] = localPr->partitionData[model]->partitionLH;
+
+  /* either make reproducible or efficient */
+  ASSIGN_GATHER(globalResult, buf, localPr->numberOfPartitions, PLL_DOUBLE, tid);
+
+  /* printf("gather worked\n"); */
+#else 
+  /* the efficient mpi version: a proper reduce  */
+  double 
+    buf[localPr->numberOfPartitions];
+  
+  for(model = 0; model < localPr->numberOfPartitions; ++model)
+    buf[model] = localPr->partitionData[model]->partitionLH;
+
+  double 
+    targetBuf[localPr->numberOfPartitions];
+  
+  memset(targetBuf, 0, sizeof(double) * localPr->numberOfPartitions);
+
+  MPI_Reduce(buf, targetBuf, localPr->numberOfPartitions, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+  
+  if(MASTER_P) 
+    {
+      for(model = 0; model < localPr->numberOfPartitions; ++model) {
+	localPr->partitionData[model]->partitionLH = targetBuf[model];
+      }
+    }
+#endif
+}
+
+
+
+/** @brief Broadcast the traversal descriptor to worker threads. 
+
+  The one below is a hack: we re-assign the local pointer to
+  the global one. The memcpy version below is just for testing and
+  for preparing the fine-grained MPI BlueGene version
+
+  @param localTree local library instance
+  @param tr library instance
+*/
+/* TODO: we should reset this at some point, the explicit copy is just done for testing */
+__inline static void broadcastTraversalInfo(pllInstance *localTree, pllInstance *tr, partitionList *localPr)
+{
+  /* @todo these two regions could be joined */
+#ifdef _USE_PTHREADS
+  /* memcpy -> memmove (see ticket #43). This function is sometimes called with localTree == tr,
+   * in which case some memcpy implementations can corrupt the buffers.
+   */
+  
+  localTree->td[0].functionType =            tr->td[0].functionType;
+  localTree->td[0].count =                   tr->td[0].count ;
+  localTree->td[0].traversalHasChanged =     tr->td[0].traversalHasChanged;
+
+  memmove(localTree->td[0].executeModel,    tr->td[0].executeModel,    sizeof(pllBoolean) * localPr->numberOfPartitions);
+  memmove(localTree->td[0].parameterValues, tr->td[0].parameterValues, sizeof(double) * localPr->numberOfPartitions);
+  
+  if(localTree->td[0].traversalHasChanged)
+    memmove(localTree->td[0].ti, tr->td[0].ti, localTree->td[0].count * sizeof(traversalInfo));
+
+#else
+  /* MPI */
+  /* like in raxml-light: first we send a small message; if the
+     traversal descriptor is longer, we resend */
+  
+  int length = treeIsInitialized ? localPr->numberOfPartitions : 0;
+  char broadCastBuffer[messageSize(length)]; 
+  char *bufPtr = broadCastBuffer; 
+  int i; 
+
+  RECV_BUF(broadCastBuffer, messageSize(length), MPI_BYTE); 
+
+  ASSIGN_BUF(localTree->td[0].functionType, tr->td[0].functionType , int);   
+  ASSIGN_BUF(localTree->td[0].count,  tr->td[0].count , int); 
+  ASSIGN_BUF(localTree->td[0].traversalHasChanged, tr->td[0].traversalHasChanged , int); 
+
+  if(treeIsInitialized)  
+    { 
+      for(i = 0; i < localPr->numberOfPartitions; ++i)
+	{
+	  ASSIGN_BUF(localTree->td[0].executeModel[i],      tr->td[0].executeModel[i], int); 
+	  ASSIGN_BUF(localTree->td[0].parameterValues[i],	 tr->td[0].parameterValues[i], double); 
+	}      
+
+      for(i = 0; i < TRAVERSAL_LENGTH; ++i )
+	ASSIGN_BUF(localTree->td[0].ti[i], tr->td[0].ti[i], traversalInfo); 
+    }
+    
+  SEND_BUF(broadCastBuffer, messageSize(length), MPI_BYTE); 
+
+  /* now we send the second part of the traversal descriptor, if we
+     exceed the pre-set number of elements */
+  if(treeIsInitialized && localTree->td[0].count > TRAVERSAL_LENGTH) 
+    {
+      /* let's use the MPI_Datatype for this; from what I've read it's
+	 supposed to be safer and more efficient */
+      MPI_Bcast(localTree->td[0].ti + TRAVERSAL_LENGTH, localTree->td[0].count - TRAVERSAL_LENGTH, TRAVERSAL_MPI, 0, MPI_COMM_WORLD );
+    }
+#endif
+}
+
+
+/** @brief helper that yields a string representation of a parallel region. 
+    
+    @param type type of parallel region
+ */ 
+char* getJobName(int type)
+{
+  switch(type)  
+    {
+    case  PLL_THREAD_NEWVIEW:       
+      return "PLL_THREAD_NEWVIEW";
+    case PLL_THREAD_EVALUATE: 
+      return "PLL_THREAD_EVALUATE";
+    case PLL_THREAD_MAKENEWZ: 
+      return "PLL_THREAD_MAKENEWZ";
+    case PLL_THREAD_MAKENEWZ_FIRST: 
+      return "PLL_THREAD_MAKENEWZ_FIRST";
+    case PLL_THREAD_RATE_CATS: 
+      return "PLL_THREAD_RATE_CATS";
+    case PLL_THREAD_COPY_RATE_CATS: 
+      return "PLL_THREAD_COPY_RATE_CATS";
+    case PLL_THREAD_COPY_INIT_MODEL: 
+      return "PLL_THREAD_COPY_INIT_MODEL";
+    case PLL_THREAD_INIT_PARTITION: 
+      return "PLL_THREAD_INIT_PARTITION";
+    case PLL_THREAD_OPT_ALPHA: 
+      return "PLL_THREAD_OPT_ALPHA";
+    case PLL_THREAD_OPT_RATE: 
+      return "PLL_THREAD_OPT_RATE";
+    case PLL_THREAD_COPY_ALPHA: 
+      return "PLL_THREAD_COPY_ALPHA";
+    case PLL_THREAD_COPY_RATES: 
+      return "PLL_THREAD_COPY_RATES";
+    case PLL_THREAD_PER_SITE_LIKELIHOODS: 
+      return "PLL_THREAD_PER_SITE_LIKELIHOODS";
+    case PLL_THREAD_NEWVIEW_ANCESTRAL: 
+      return "PLL_THREAD_NEWVIEW_ANCESTRAL";
+    case PLL_THREAD_GATHER_ANCESTRAL: 
+      return "PLL_THREAD_GATHER_ANCESTRAL";
+    case PLL_THREAD_EXIT_GRACEFULLY: 
+      return "PLL_THREAD_EXIT_GRACEFULLY";
+    case PLL_THREAD_EVALUATE_PER_SITE_LIKES:
+      return "PLL_THREAD_EVALUATE_PER_SITE_LIKES";
+    default: assert(0); 
+    }
+}
+
+/**
+   @brief Generic entry point for parallel regions (mostly broadcasts
+   traversal descriptor first).
+
+   This function here handles all parallel regions in the Pthreads
+   version, when we enter this function pllMasterBarrier() has been called
+   by the master thread from within the sequential part of the
+   program, tr is the library instance (tree) at the master thread, 
+   localTree is the library instance (tree) at the worker threads
+
+   While this is not necessary, address spaces of threads are indeed
+   separated for easier transition to a distributed memory paradigm
+   
+   @param tr library instance
+   @param localTree local library instance 
+   @param tid worker id 
+   @param n number of workers 
+*/
+static pllBoolean execFunction(pllInstance *tr, pllInstance *localTree, partitionList *pr, partitionList *localPr, int tid, int n)
+{
+  int
+    i,
+    model,
+    localCounter;
+
+#ifdef MEASURE_TIME_PARALLEL
+  double timeForParallelRegion = gettime();
+#endif
+
+
+#ifdef _USE_PTHREADS
+  /* some stuff associated with the barrier implementation using Pthreads and busy wait */
+  int currentJob = threadJob >> 16;
+#endif
+
+  /* here the master sends and all threads/processes receive the traversal descriptor */
+  broadcastTraversalInfo(localTree, tr, localPr);
+
+#ifdef _USE_PTHREADS
+  /* make sure that nothing is going wrong */
+  assert(currentJob == localTree->td[0].functionType);
+#else   
+  localTree = tr; 
+  int currentJob = localTree->td[0].functionType; 
+#endif
+
+#ifdef DEBUG_PARALLEL
+  printf("[%d] working on %s\n", tid, getJobName(currentJob)); 
+#endif  
+
+  switch(currentJob)
+    { 
+    case PLL_THREAD_NEWVIEW: 
+      /* just a newview on the fraction of sites that have been assigned to this thread */
+
+      pllNewviewIterative(localTree, localPr, 0);
+      break;     
+    case PLL_THREAD_EVALUATE: 
+      reduceEvaluateIterative(tr, localTree, localPr, tid, PLL_FALSE);
+      break;	
+    case PLL_THREAD_MAKENEWZ_FIRST:
+
+      /* this is the first call from within makenewz that requires getting the likelihood vectors to the left and 
+         right of the branch via newview and doing some precomputations.
+	 
+         For details see comments in makenewzGenericSpecial.c 
+      */
+    case  PLL_THREAD_MAKENEWZ:
+      {	
+	double
+	  dlnLdlz[PLL_NUM_BRANCHES],
+	  d2lnLdlz2[PLL_NUM_BRANCHES]; 
+
+	if(localTree->td[0].functionType == PLL_THREAD_MAKENEWZ_FIRST)
+	  makenewzIterative(localTree, localPr);
+	execCore(localTree, localPr, dlnLdlz, d2lnLdlz2);
+
+	/* gather the first and second derivatives that have been written by each thread */
+	/* as for evaluate above, the final sum over the derivatives will be computed by the 
+	   master thread in its sequential part of the code */
+
+	int numBranches = localPr->perGeneBranchLengths?localPr->numberOfPartitions:1;
+
+#ifdef _REPRODUCIBLE_MPI_OR_PTHREADS
+	/* MPI: implemented as a gather again, pthreads: just buffer copying */	
+	double buf[ 2 * numBranches];
+	memcpy( buf, dlnLdlz, numBranches * sizeof(double) );
+	memcpy(buf + numBranches, d2lnLdlz2, numBranches * sizeof(double));
+
+	ASSIGN_GATHER(globalResult, buf,  2 * numBranches, PLL_DOUBLE, tid);
+#else 	
+	double result[numBranches];
+	memset(result,0, numBranches * sizeof(double));
+	MPI_Reduce( dlnLdlz , result , numBranches, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+	if(MASTER_P)
+	  memcpy(globalResult, result, sizeof(double) * numBranches);
+	
+	memset(result,0,numBranches * sizeof(double));
+	MPI_Reduce( d2lnLdlz2 , result , numBranches, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+	if(MASTER_P)
+	  memcpy(globalResult + numBranches, result, sizeof(double) * numBranches);
+#endif
+      }
+
+      break;
+
+    case PLL_THREAD_INIT_PARTITION:       
+
+      /* broadcast data and initialize and allocate arrays in partitions */
+      
+      initializePartitionsMaster(tr, localTree, pr, localPr, tid, n);
+
+      break;          
+    case PLL_THREAD_COPY_ALPHA: 
+    case PLL_THREAD_OPT_ALPHA:
+      /* this is when we have changed the alpha parameter, inducing a change in the discrete gamma rate categories.
+	 this is called when we are optimizing or sampling (in the Bayesian case) alpha parameter values */
+      
+      /* distribute the new discrete gamma rates to the threads */
+      broadCastAlpha(localPr,pr);
+
+      /* compute the likelihood, note that this is always a full tree traversal ! */
+      if(localTree->td[0].functionType == PLL_THREAD_OPT_ALPHA)
+	reduceEvaluateIterative(tr, localTree, localPr, tid, PLL_FALSE);
+
+      break;
+    case PLL_THREAD_OPT_RATE:
+    case PLL_THREAD_COPY_RATES:
+
+      /* if we are optimizing the rates in the transition matrix Q, this induces recomputing the eigenvector/eigenvalue 
+	 decomposition and the tipVector as well, because with the special numerics in RAxML the matrix of eigenvectors 
+	 is "rotated" into the tip lookup table.
+
+	 Hence if the sequential part of the program that steers the Q matrix rate optimization has changed a rate we
+	 need to broadcast all eigenvectors, eigenvalues etc to each thread 
+      */
+
+      broadCastRates(localPr, pr);
+
+      /* now evaluate the likelihood of the new Q matrix, this always requires a full tree traversal because the changes need
+	 to be propagated throughout the entire tree */
+
+      if(localTree->td[0].functionType == PLL_THREAD_OPT_RATE)
+	reduceEvaluateIterative(tr, localTree, localPr, tid, PLL_FALSE);
+
+      break;
+    case PLL_THREAD_COPY_LG4X_RATES:
+
+        broadCastLg4xWeights(localPr, pr);
+        broadCastAlpha(localPr, pr);
+
+        assert(localPr->partitionData[0]->lg4x_weights[0] == pr->partitionData[0]->lg4x_weights[0]);
+
+        break;
+    case PLL_THREAD_OPT_LG4X_RATE:
+
+        broadCastLg4xWeights(localPr, pr);
+        broadCastAlpha(localPr, pr);
+
+        assert(localPr->partitionData[0]->lg4x_weights[0] == pr->partitionData[0]->lg4x_weights[0]);
+
+        /* compute the likelihood, note that this is always a full tree traversal ! */
+        reduceEvaluateIterative(tr, localTree, localPr, tid, PLL_FALSE);
+
+        break;
+    case PLL_THREAD_COPY_INIT_MODEL:
+      {
+
+	/* need to be very careful here! PLL_THREAD_COPY_INIT_MODEL is also used when the program is restarted; 
+	   it is hence not sufficient to just initialize everything with the default values! */
+
+	broadCastRates(localPr, pr);
+	broadCastAlpha(localPr, pr); /* isnt that only executed when we are on gamma?  */
+	broadCastLg4xWeights(localPr, pr);
+
+	/*
+	  copy initial model parameters: the Q matrix and alpha are initially, when we start our likelihood search, 
+	  set to default values. 
+	  Hence we need to copy all those values that are required for computing the likelihood 
+	  with newview(), evaluate() and makenewz() to the private memory of the threads 
+	*/
+
+
+	if( localTree->rateHetModel == PLL_CAT) /* TRICKY originally this should only be executed by workers  */
+	  {
+#ifdef _FINE_GRAIN_MPI
+	    int bufSize = 2 * localTree->originalCrunchedLength * sizeof(double); 
+	    char bufDbl[bufSize], 
+	      *bufPtrDbl = bufDbl; 
+#endif
+
+	    RECV_BUF(bufDbl, bufSize,MPI_BYTE); 
+
+	    /* this should be local  */
+	    for(model = 0; model < localPr->numberOfPartitions; model++)
+	      localPr->partitionData[model]->numberOfCategories      = pr->partitionData[model]->numberOfCategories;
+
+
+	    /* this is only relevant for the PSR model, we can worry about this later */
+	    for(i = 0; i < localTree->originalCrunchedLength; ++i)
+	      {
+		ASSIGN_BUF_DBL(localTree->patrat[i], tr->patrat[i]);
+		ASSIGN_BUF_DBL(localTree->patratStored[i], tr->patratStored[i]); 
+	      }
+
+	    SEND_BUF(bufDbl, bufSize, MPI_BYTE); 
+	  }
+      } 
+      break;    
+    case PLL_THREAD_RATE_CATS: 
+      {
+	/* this is for optimizing per-site rate categories under PSR, let's worry about this later */
+
+	ASSIGN_DBL( localTree->lower_spacing,  tr->lower_spacing);
+	ASSIGN_DBL( localTree->upper_spacing,  tr->upper_spacing);
+
+	optRateCatPthreads(localTree, localPr, localTree->lower_spacing, localTree->upper_spacing, localTree->lhs, n, tid);
+
+	broadcastAfterRateOpt(tr, localTree, localPr, n,  tid);
+      }
+      break;
+    case PLL_THREAD_COPY_RATE_CATS:
+      {
+	/* 
+	   this is invoked when we have changed the per-site rate category assignment.
+	   In essence it distributes the new per-site rates to all threads. 
+
+	   The pthread version here simply assigns everything as it ought to
+	   be. The MPI version is configured to write to a buffer instead
+	   and SEND (master) or RECV (workers) it.
+
+	*/
+
+	/* 
+	   start of communication part 
+	*/
+
+	int i, 
+	  /* buf[localPr->numberOfPartitions], */
+	  /* assertCtr = 0,  */
+	  dblBufSize = 0; 
+
+#ifdef _FINE_GRAIN_MPI
+	int bufSize = localPr->numberOfPartitions * sizeof(int); 
+	char buf[bufSize]; 
+	char *bufPtr = buf; 
+#endif
+     
+	RECV_BUF(buf, bufSize, MPI_BYTE);
+
+	for( model = 0; model < localPr->numberOfPartitions; ++model)
+	  {
+	    ASSIGN_BUF(localPr->partitionData[model]->numberOfCategories, pr->partitionData[model]->numberOfCategories, int);
+	    dblBufSize += localPr->partitionData[model]->numberOfCategories * sizeof(double);
+	  }
+
+	SEND_BUF(buf, bufSize, MPI_BYTE); 
+
+
+	dblBufSize += 2 * localTree->originalCrunchedLength * sizeof(double); 
+
+#ifdef _FINE_GRAIN_MPI
+	char bufDbl[dblBufSize],
+	  *bufPtrDbl = bufDbl;
+#endif
+
+	RECV_BUF(bufDbl, dblBufSize, MPI_BYTE); 
+
+	for(i = 0; i < localTree->originalCrunchedLength; ++i)
+	  {	 
+	    ASSIGN_BUF_DBL(localTree->patrat[i], tr->patrat[i]); 
+	    ASSIGN_BUF_DBL(localTree->patratStored[i], tr->patratStored[i]); 
+	  }
+
+	for( model = 0; model < localPr->numberOfPartitions; ++model)
+	  for(i = 0; i < localPr->partitionData[model]->numberOfCategories; i++)
+	    ASSIGN_BUF_DBL(localPr->partitionData[model]->perSiteRates[i], pr->partitionData[model]->perSiteRates[i]);
+
+	SEND_BUF(bufDbl, dblBufSize, MPI_BYTE); 
+
+
+	/* let's test whether it is a good idea to send around the basic categories  */
+#ifdef _FINE_GRAIN_MPI
+	/* TODO this is inefficient, but it seems to have only a small impact on performance */
+	MPI_Bcast(tr->rateCategory, tr->originalCrunchedLength, MPI_INT, 0, MPI_COMM_WORLD); 
+#endif
+
+
+	/* 
+	   now re-assign values 
+	*/
+	for(model = 0; model < localPr->numberOfPartitions; model++)
+	  {
+	    if(localTree->manyPartitions)
+	      {
+		if(isThisMyPartition(localPr, tid, model))
+		  for(localCounter = 0, i = localPr->partitionData[model]->lower;  i < localPr->partitionData[model]->upper; i++, localCounter++)
+		    {	     
+		      localPr->partitionData[model]->rateCategory[localCounter] = tr->rateCategory[i];
+		    } 
+	      }
+	    else	  
+	      {
+		for(localCounter = 0, i = localPr->partitionData[model]->lower;  i < localPr->partitionData[model]->upper; i++)
+		  {
+		    if(i % n == tid)
+		      {		 
+			localPr->partitionData[model]->rateCategory[localCounter] = tr->rateCategory[i];
+
+			localCounter++;
+		      }
+		  }
+	      }
+	  }
+      }
+      break;
+    case PLL_THREAD_PER_SITE_LIKELIHOODS:      
+      {
+
+	/* compute per-site log likelihoods for the sites/partitions 
+	   that are handled by this thread */
+	perSiteLogLikelihoodsPthreads(localTree, localPr, localTree->lhs, n, tid);
+
+	/* do a parallel gather operation, the threads will write their results 
+	   into the global buffer tr->lhs that will then contain all per-site log likelihoods
+	   in the proper order 
+	*/
+
+	collectDouble(tr->lhs,                localTree->lhs,                  localTree, localPr, n, tid);
+
+      }
+      break;
+      /* check for errors */
+    case PLL_THREAD_NEWVIEW_ANCESTRAL:       
+      assert(0);
+      break; 
+    case PLL_THREAD_GATHER_ANCESTRAL:
+      assert(0); 
+      break; 
+    case PLL_THREAD_EXIT_GRACEFULLY: 
+      {
+	/* cleans up the worker's memory */
+
+#ifdef _USE_PTHREADS
+	/* TODO destroying the tree does not work yet in a highly
+	   generic manner. */
+
+	if(NOT MASTER_P)
+	  {
+	    pllPartitionsDestroy (localTree, &localPr);
+	    /* pllTreeDestroy (localTree); */
+	  }
+	else 
+	  {
+	    //pllPartitionsDestroy (tr, &pr);
+	    /* pllTreeDestroy (tr); */
+	  }
+
+#else 
+	//pllPartitionsDestroy (tr, &pr);
+	/* pllTreeDestroy (tr); */
+	
+	//MPI_Finalize();
+	//exit(0); 
+#endif	
+	return PLL_FALSE; 
+      }
+      break; 
+    case PLL_THREAD_EVALUATE_PER_SITE_LIKES: 
+      {
+	reduceEvaluateIterative(tr, localTree, localPr, tid, PLL_TRUE);
+      }
+      break;
+    default:
+      printf("Job %d\n", currentJob);
+      assert(0);
+    }
+
+  return PLL_TRUE; 
+}
+
+
+
+
+/**  Target function where the threads/processes are trapped
+
+     The threads/processes spend all of their time in this function
+     running operations on the data (computing likelihoods).
+
+     @param tData
+       Structure that contains the vital information for the thread/process, 
+       i.e. PLL instance, list of partitions and thread ID
+
+     @note
+       The data in \a tData are different for pthreads and MPI. 
+       Expand this section.
+ */ 
+static void *likelihoodThread(void *tData)
+{
+  threadData *td = (threadData*)tData;
+  pllInstance 
+    *tr = td->tr;
+  partitionList *pr = td->pr;
+
+#ifdef _USE_PTHREADS
+  pllInstance *localTree = rax_calloc(1,sizeof(pllInstance )); 
+  partitionList *localPr = rax_calloc(1,sizeof(partitionList));
+
+  int
+    myCycle = 0,
+    localTrap = 1;
+
+  const int 
+    n = td->tr->numberOfThreads,
+    tid = td->threadNumber;
+
+#ifndef _PORTABLE_PTHREADS
+  pinToCore(tid);
+#endif
+
+  /* printf("\nThis is RAxML Worker Pthread Number: %d\n", tid); */
+
+  while(localTrap)
+    {
+
+      while (myCycle == threadJob);
+      myCycle = threadJob;
+
+      if ((threadJob >> 16) != PLL_THREAD_INIT_PARTITION) {
+    	  localPr->perGeneBranchLengths = pr->perGeneBranchLengths;
+      	  localPr->numberOfPartitions = pr->numberOfPartitions;
+      }
+      localTrap = execFunction(tr, localTree, pr, localPr, tid, n);
+
+      barrierBuffer[tid] = 1;     
+    }
+    rax_free (localTree->td[0].executeModel); //localTree->td[0].executeModel = NULL;
+    rax_free (localTree->td[0].parameterValues); //localTree->td[0].parameterValues = NULL;
+    rax_free (localTree->rateCategory); //localTree->rateCategory = NULL;
+    rax_free (localTree->lhs); //localTree->lhs = NULL;
+    rax_free (localTree->patrat); //localTree->patrat = NULL;
+    rax_free (localTree->patratStored); //localTree->patratStored = NULL;
+    rax_free (localTree->td[0].ti); //localTree->td[0].ti = NULL;
+    rax_free (localTree);
+#else 
+  const int
+    n = processes, 
+    tid = td->threadNumber;
+  int i;
+
+  /* printf("\nThis is RAxML Worker Process Number: %d\n", tid); */
+
+  while(execFunction(tr, tr, pr, pr, tid,n));
+
+  rax_free (tr->lhs);
+  rax_free (tr->td[0].ti);
+  rax_free (tr->td[0].executeModel);
+  rax_free (tr->td[0].parameterValues);
+  rax_free (tr->patrat);
+  rax_free (tr->patratStored);
+  rax_free (tr->aliaswgt);
+  rax_free (tr->y_ptr);
+  for (i = 0; i < pr->numberOfPartitions; ++ i)
+    rax_free (pr->partitionData[i]);
+  rax_free (pr->partitionData);
+  rax_free (pr);
+  rax_free (tr);
+#endif
+
+  return (void*)NULL;
+}
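+
+/* A minimal sketch (not part of the upstream code) of how workers typically end
+   up trapped in likelihoodThread() under the pthreads backend. The threadData
+   array and loop shown here are assumptions for illustration; the actual spawning
+   code, including the barrierBuffer/threadJob setup, lives elsewhere in PLL.
+
+   threadData *tData = (threadData *)rax_malloc(n * sizeof(threadData));
+   pthread_t  *tids  = (pthread_t *)rax_malloc(n * sizeof(pthread_t));
+   int t;
+
+   for(t = 1; t < n; t++)
+     {
+       tData[t].tr           = tr;
+       tData[t].pr           = pr;
+       tData[t].threadNumber = t;
+       pthread_create(&tids[t], NULL, likelihoodThread, (void *)&tData[t]);
+     }
+*/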
+
+
+/**
+   @brief Cleanup step once the master barrier succeeded. 
+
+   This is master-specific code called once the barrier has been
+   passed, e.g. reduction operations.  Executing this here keeps
+   the rest of the code mostly free from parallel-specific
+   code.
+   
+   @param tr 
+     PLL instance
+
+   @param pr
+     List of partitions
+
+   @param jobType 
+     Job that is to be executed
+*/
+void pllMasterPostBarrier(pllInstance *tr, partitionList *pr, int jobType)
+{
+  assert(tr->threadID == 0); 
+  
+  switch(jobType)
+    {
+    case PLL_THREAD_EVALUATE: 
+    case PLL_THREAD_OPT_RATE: 
+    case PLL_THREAD_OPT_ALPHA:
+    case PLL_THREAD_OPT_LG4X_RATE:
+    case PLL_THREAD_EVALUATE_PER_SITE_LIKES: 
+      {
+#ifdef _REPRODUCIBLE_MPI_OR_PTHREADS
+	int i,j;
+	volatile double partitionResult;	
+
+	for(j = 0; j < pr->numberOfPartitions; j++)
+	  {
+	    for(i = 0, partitionResult = 0.0; i < tr->numberOfThreads; i++) 
+	      partitionResult += globalResult[i * pr->numberOfPartitions+ j];
+
+	    pr->partitionData[j]->partitionLH = partitionResult;
+	  }
+#endif      
+
+	break; 
+      } 
+    case PLL_THREAD_PER_SITE_LIKELIHOODS:
+      {
+	int i; 
+	/* now just compute the sum over per-site log likelihoods for error checking */      
+	double accumulatedPerSiteLikelihood = 0.; 
+	for(i = 0; i < tr->originalCrunchedLength; i++)
+	  accumulatedPerSiteLikelihood += tr->lhs[i];
+
+	/* printf("RESULT: %f\t%f", tr->likelihood, accumulatedPerSiteLikelihood);  */
+	assert(PLL_ABS(tr->likelihood - accumulatedPerSiteLikelihood) < 0.00001);
+      }
+      break;
+    default: 
+      ; 			/* don't do anything by default;
+				   mostly we can just skip this */
+    } 
+}
+
+/**
+   @brief A generic master barrier for executing parallel parts of the code
+
+   A generic master barrier through which the master thread/process controls
+   the work job execution. Through the parameter \a jobType the master instructs
+   the slaves of what type of work they must conduct.
+
+   @param tr
+     PLL instance
+
+   @param pr
+     List of partitions
+
+   @param jobType 
+     Type of job to be conducted
+ */ 
+void pllMasterBarrier(pllInstance *tr, partitionList *pr, int jobType)
+{
+
+#ifdef MEASURE_TIME_PARALLEL
+  assert(jobType < NUM_PAR_JOBS); 
+  timePerRegion[NUM_PAR_JOBS]  += gettime()- masterTimePerPhase ; 
+  masterTimePerPhase = gettime();
+#endif
+
+#ifdef _USE_PTHREADS
+  const int 
+    n = tr->numberOfThreads;
+
+  tr->td[0].functionType = jobType;
+
+  jobCycle = !jobCycle;
+  threadJob = (jobType << 16) + jobCycle;
+
+  execFunction(tr, tr, pr, pr, 0, n);
+
+  int 
+    i, 
+    sum;
+
+  do
+    {
+      for(i = 1, sum = 1; i < n; i++)
+	sum += barrierBuffer[i];
+    }
+  while(sum < n);  
+
+  for(i = 1; i < n; i++)
+    barrierBuffer[i] = 0;
+#else 
+  tr->td[0].functionType = jobType; 
+  execFunction(tr,tr,pr,pr,0,processes);
+#endif
+
+  /* code executed by the master, once the barrier is crossed */
+  pllMasterPostBarrier(tr, pr, jobType);
+
+#ifdef MEASURE_TIME_PARALLEL
+  timePerRegion[jobType] += gettime() - masterTimePerPhase; 
+  masterTimePerPhase = gettime();
+#endif
+}
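+
+/* Usage sketch (illustration only): the sequential part of the code triggers a
+   parallel region simply by naming the job type, e.g. after proposing a new
+   alpha the master would call
+
+     pllMasterBarrier(tr, pr, PLL_THREAD_OPT_ALPHA);
+
+   which broadcasts the new discrete gamma rates to all workers, has every
+   worker run a full-traversal evaluation, and (when
+   _REPRODUCIBLE_MPI_OR_PTHREADS is defined) accumulates the per-partition
+   log likelihoods into pr->partitionData[]->partitionLH in
+   pllMasterPostBarrier(). */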
+
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+
+/** @brief Initialize structures for slave process/threads
+ 
+    Allocate all memory structures required by slave threads/processes
+
+    @param tr 
+      PLL Instance
+
+    @param localTree 
+      A local PLL instance for the slave process/thread which is initialized in this function based on \a tr
+
+    @param pr
+      List of partitions
+
+    @param localPr
+      A local list of partitions for the slave process/thread which will be initialized based on \a pr 
+
+    @param tid
+      The slave process/thread ID
+
+    @note
+      This function should never be called by the master thread, but it is called by the master process in the MPI implementation.
+ */ 
+static void assignAndInitPart1(pllInstance *localTree, pllInstance *tr, partitionList *localPr, partitionList *pr, int *tid)
+{
+  size_t
+    model; 
+  int
+    totalLength = 0; 
+
+#ifdef _USE_PTHREADS
+  localTree->threadID = *tid; 
+  /* printf("my id is %d\n", *tid);  */
+  assert(localTree != tr);
+  localTree->numberOfThreads = tr->numberOfThreads;
+#else  /* => MPI */
+  *tid = processID; 
+  localTree->threadID = processID; 
+  tr->numberOfThreads = processes;
+
+  int bufSize = (9 + pr->numberOfPartitions* 8) * sizeof(int);
+  char buf[bufSize], 
+    *bufPtr = buf;  
+#endif
+
+  RECV_BUF(buf, bufSize, MPI_BYTE); 
+
+  ASSIGN_BUF( localTree->useRecom,                  tr->useRecom, int);
+  ASSIGN_BUF( localTree->rateHetModel,              tr->rateHetModel, int);
+  ASSIGN_BUF( localTree->useMedian,                 tr->useMedian, int); 
+  ASSIGN_BUF( localTree->saveMemory,                tr->saveMemory, int);
+  ASSIGN_BUF( localTree->maxCategories,             tr->maxCategories, int);
+  ASSIGN_BUF( localTree->originalCrunchedLength,    tr->originalCrunchedLength, int);
+  ASSIGN_BUF( localTree->mxtips,                    tr->mxtips, int);
+  ASSIGN_BUF( localPr->numberOfPartitions,          pr->numberOfPartitions, int);
+  ASSIGN_BUF( localPr->perGeneBranchLengths,        pr->perGeneBranchLengths, pllBoolean);
+
+  localTree->td[0].count = 0; 
+
+  if(NOT MASTER_P)
+    {
+      localTree->lhs                     = (double*)rax_calloc((size_t)localTree->originalCrunchedLength, sizeof(double));     
+      localPr->partitionData           = (pInfo**)rax_calloc(PLL_NUM_BRANCHES,sizeof(pInfo*));
+      for(model = 0; model < (size_t)localPr->numberOfPartitions; model++) {
+    	localPr->partitionData[model] = (pInfo*)rax_calloc(1,sizeof(pInfo));
+      }
+      localTree->td[0].ti              = (traversalInfo *)rax_malloc(sizeof(traversalInfo) * (size_t)localTree->mxtips);
+      localTree->td[0].executeModel    = (pllBoolean *)rax_malloc(sizeof(pllBoolean) * PLL_NUM_BRANCHES);
+      localTree->td[0].parameterValues = (double *)rax_malloc(sizeof(double) * PLL_NUM_BRANCHES);
+      localTree->patrat       = (double*)rax_malloc(sizeof(double) * (size_t)localTree->originalCrunchedLength);
+      localTree->patratStored = (double*)rax_malloc(sizeof(double) * (size_t)localTree->originalCrunchedLength);            
+    }
+  
+  for(model = 0; model < (size_t)localPr->numberOfPartitions; model++)
+    {
+      ASSIGN_BUF(localPr->partitionData[model]->numberOfCategories,     pr->partitionData[model]->numberOfCategories, int);
+      ASSIGN_BUF(localPr->partitionData[model]->states,                 pr->partitionData[model]->states, int);
+      ASSIGN_BUF(localPr->partitionData[model]->maxTipStates ,          pr->partitionData[model]->maxTipStates, int);
+      ASSIGN_BUF(localPr->partitionData[model]->dataType ,              pr->partitionData[model]->dataType, int);
+      ASSIGN_BUF(localPr->partitionData[model]->protModels ,            pr->partitionData[model]->protModels, int);
+      ASSIGN_BUF(localPr->partitionData[model]->protUseEmpiricalFreqs , pr->partitionData[model]->protUseEmpiricalFreqs, int);
+      ASSIGN_BUF(localPr->partitionData[model]->lower ,                 pr->partitionData[model]->lower, int);
+      ASSIGN_BUF(localPr->partitionData[model]->upper ,                 pr->partitionData[model]->upper, int);
+      ASSIGN_BUF(localPr->partitionData[model]->ascBias,                pr->partitionData[model]->ascBias, pllBoolean);
+
+      localPr->partitionData[model]->partitionLH = 0.0;      
+
+      totalLength += (localPr->partitionData[model]->upper -  localPr->partitionData[model]->lower);
+    }
+
+  SEND_BUF(buf, bufSize, MPI_BYTE); 
+
+  assert(totalLength == localTree->originalCrunchedLength);
+
+  ASSIGN_DBL(localTree->vectorRecomFraction, tr->vectorRecomFraction); 
+}
+#endif
+
+
+/** @brief Distribute y-vectors during initialization. 
+
+    Distribute the alignment data to the slave process/threads. Each slave
+    copies the data (alignment) from its assigned partition to its local 
+    partition structure.
+
+    @param tr 
+      PLL instance
+    
+    @param localTree 
+      Local library instance for the current thread
+
+    @param localPr
+      Local list of partitions structure for the current thread
+ */ 
+static void distributeYVectors(pllInstance *localTree, pllInstance *tr, partitionList *localPr)
+{
+  size_t 
+    i,
+    n = localTree->numberOfThreads,
+    globalCounter = 0,
+    localCounter = 0,
+    model = 0, 
+    j; 
+  int tid = localTree->threadID; 
+  
+
+  /* distribute the y-vectors */
+  for(j = 1 ; j <= (size_t)localTree->mxtips; j++)	
+    {
+#ifdef _FINE_GRAIN_MPI
+      unsigned char yBuf[tr->originalCrunchedLength]; 	  
+      if(MASTER_P)
+	memcpy(yBuf, tr->yVector[j], tr->originalCrunchedLength * sizeof(unsigned char));
+      MPI_Bcast(  yBuf, tr->originalCrunchedLength, MPI_UNSIGNED_CHAR,0,MPI_COMM_WORLD); 
+#endif	  
+
+      for(model = 0, globalCounter = 0; model < (size_t)localPr->numberOfPartitions; model++)
+	{
+	  if(tr->manyPartitions)
+	    {
+	      if(isThisMyPartition(localPr, tid, model))
+		{
+		  assert(localPr->partitionData[model]->upper - localPr->partitionData[model]->lower == localPr->partitionData[model]->width);
+		  for(localCounter = 0, i = (size_t)localPr->partitionData[model]->lower;  i < (size_t)localPr->partitionData[model]->upper; i++, localCounter++, globalCounter++)
+#ifdef _USE_PTHREADS
+		    localPr->partitionData[model]->yVector[j][localCounter] = tr->yVector[j][globalCounter];
+#else 
+		  localPr->partitionData[model]->yVector[j][localCounter] = yBuf[globalCounter];
+#endif
+
+
+		}
+	      else
+		globalCounter += (localPr->partitionData[model]->upper - localPr->partitionData[model]->lower);
+	    }
+	  else 
+	    {
+	      for(localCounter = 0, i = (size_t)localPr->partitionData[model]->lower;  i < (size_t)localPr->partitionData[model]->upper; i++, globalCounter++)
+		{
+		  if(i % (size_t)n == (size_t)tid)
+		    {
+#ifdef _USE_PTHREADS
+		      localPr->partitionData[model]->yVector[j][localCounter] = tr->yVector[j][globalCounter];
+#else 
+		      localPr->partitionData[model]->yVector[j][localCounter] = yBuf[globalCounter];
+#endif
+		      ++localCounter; 
+		    }
+		}	   
+	    }
+	}
+    }
+}
+
+/** @brief Distribute the alignment weights to the slave processes/threads
+
+    Allocate space in the local tree structure for the alignment weights. Then
+    copy the weights vector from the master process/thread to the slaves.
+
+    @param tr 
+      PLL instance
+    
+    @param localTree 
+      Local library instance for the current process/thread
+
+    @param localPr
+      Local list of partitions for the current process/thread
+
+    @todo
+      The alignment weights should go to the partitions structure rather than the tree structure
+ */ 
+static void distributeWeights(pllInstance *localTree, pllInstance *tr, partitionList *localPr)
+{
+  int tid = localTree->threadID; 
+  int n = localTree->numberOfThreads; 
+
+  size_t     
+    globalCounter = 0,
+    i,
+    localCounter  = 0,
+    model; 
+
+
+
+  /* distribute the weights  */
+#ifdef _FINE_GRAIN_MPI 		/* need to broadcast a few things first */
+  if(NOT MASTER_P)
+    tr->aliaswgt = rax_malloc(sizeof(int) * tr->originalCrunchedLength); 
+  MPI_Bcast(tr->aliaswgt, tr->originalCrunchedLength, MPI_INT, 0, MPI_COMM_WORLD);      
+#endif
+  for(model = 0, globalCounter = 0; model < (size_t)localPr->numberOfPartitions; model++)
+    { 
+      if(tr->manyPartitions)
+	{
+	  if(isThisMyPartition(localPr, tid, model))
+	    {
+	      assert(localPr->partitionData[model]->upper - localPr->partitionData[model]->lower == localPr->partitionData[model]->width);
+	      for(localCounter = 0, i = (size_t)localPr->partitionData[model]->lower;  i < (size_t)localPr->partitionData[model]->upper; i++, localCounter++, globalCounter++)
+		localPr->partitionData[model]->wgt[localCounter]          = tr->aliaswgt[globalCounter];
+	    }
+	  else
+	    globalCounter += (localPr->partitionData[model]->upper - localPr->partitionData[model]->lower);
+	}
+      else 
+	{ 
+	  for(localCounter = 0, i = (size_t)localPr->partitionData[model]->lower;  i < (size_t)localPr->partitionData[model]->upper; i++, globalCounter++)
+	    {
+	      if(i % (size_t)n == (size_t)tid)
+		localPr->partitionData[model]->wgt[localCounter++]       = tr->aliaswgt[globalCounter];
+	    }	   
+	}
+    }
+}
+
+
+/** @brief Initialize the partitioning scheme (master function) in a parallel environment.
+    
+    Initialize the partitioning scheme in all processes/threads. This is a wrapper function
+    that calls all necessary functions for allocating the local structures of the slave threads
+    and for distributing all necessary data, such as alignment data and weight vectors,
+    from the master thread.
+
+    @param tr 
+      PLL instance
+
+    @param localTree 
+      Local PLL instance for the slave process/thread
+
+    @param pr
+      List of partitions
+
+    @param localPr
+      Local partition structure for the slave process/thread
+
+    @param tid
+      Process/thread id
+
+    @param n 
+      Number of processes/threads
+*/ 
+static void initializePartitionsMaster(pllInstance *tr, pllInstance *localTree, partitionList *pr, partitionList *localPr, int tid, int n)
+{ 
+  size_t
+    model;
+
+  treeIsInitialized = PLL_TRUE; 
+
+  ASSIGN_INT(localTree->manyPartitions, tr->manyPartitions);
+  ASSIGN_INT(localTree->numberOfThreads, tr->numberOfThreads);
+  ASSIGN_INT(localPr->numberOfPartitions, pr->numberOfPartitions);
+
+#ifdef _USE_PTHREADS
+  if(MASTER_P)
+    globalResult = rax_calloc((size_t) tr->numberOfThreads * (size_t)pr->numberOfPartitions* 2 ,sizeof(double));
+  else 
+    assignAndInitPart1(localTree, tr, localPr, pr, &tid);
+#else 
+  globalResult = rax_calloc((size_t) tr->numberOfThreads * (size_t)pr->numberOfPartitions* 2 ,sizeof(double));
+  assignAndInitPart1(localTree, tr, localPr, pr, &tid);
+  defineTraversalInfoMPI();
+#endif
+
+  for(model = 0; model < (size_t)localPr->numberOfPartitions; model++)
+    localPr->partitionData[model]->width        = 0;
+
+  if(tr->manyPartitions)    
+    {
+      multiprocessorScheduling(localTree, localPr, tid);
+      computeFractionMany(localPr, tid);
+    }
+  else
+    computeFraction(localPr, tid, n);
+
+  initializePartitionData(localTree, localPr);
+
+  {
+    size_t 
+      model,  
+      i,      
+      countOffset,
+      myLength = 0;
+
+    for(model = 0; model < (size_t)localPr->numberOfPartitions; model++)
+      myLength += localPr->partitionData[model]->width;
+
+    /* assign local memory for storing sequence data */
+    
+    localTree->y_ptr = (unsigned char *)rax_malloc(myLength * (size_t)(localTree->mxtips) * sizeof(unsigned char));
+    assert(localTree->y_ptr != NULL);
+
+    for(i = 0; i < (size_t)localTree->mxtips; i++)
+      {
+	for(model = 0, countOffset = 0; model < (size_t)localPr->numberOfPartitions; model++)
+	  {	    
+	    localPr->partitionData[model]->yVector[i+1]   = &localTree->y_ptr[i * myLength + countOffset];
+	    countOffset +=  localPr->partitionData[model]->width;
+	  }
+	assert(countOffset == myLength);
+      }
+
+    /* now distribute the actual data */
+
+    distributeWeights(localTree, tr, localPr);
+
+    distributeYVectors(localTree, tr, localPr);
+
+  }
+
+  initMemorySavingAndRecom(localTree, localPr);
+}
diff --git a/pll/genericParallelization.h b/pll/genericParallelization.h
new file mode 100644
index 0000000..576f8e9
--- /dev/null
+++ b/pll/genericParallelization.h
@@ -0,0 +1,127 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file genericParallelization.h
+ */
+#ifndef _GENERIC_PARALL_H 
+#define _GENERIC_PARALL_H 
+
+
+extern double *globalResult; 
+
+
+/**********/
+/* CONFIG */
+/**********/
+
+/* #define MEASURE_TIME_PARALLEL */
+#define _PORTABLE_PTHREADS
+/* #define DEBUG_PARALLEL */ 
+/* #define DEBUG_MPI_EACH_SEND */
+/* #define _REPRODUCIBLE_MPI_OR_PTHREADS */
+#ifdef _USE_PTHREADS
+#ifndef _PORTABLE_PTHREADS
+void pinToCore(int tid);
+#endif
+#endif
+
+
+#define NOT ! 
+#define IS_PARALLEL (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI)) 
+
+
+
+#ifdef MEASURE_TIME_PARALLEL
+#define NUM_PAR_JOBS 16
+extern double masterTimePerPhase; 
+#endif
+
+
+/******************/
+/* MPI SPECIFIC   */
+/******************/
+#ifdef _FINE_GRAIN_MPI
+#include <mpi.h>
+#ifdef DEBUG_MPI_EACH_SEND
+#define DEBUG_PRINT(text, elem) printf(text, elem)
+#else 
+#define DEBUG_PRINT(text, elem) NULL
+#endif
+
+/* for the broadcast of traversal descriptor */
+#define TRAVERSAL_LENGTH 5
+#define traversalSize sizeof(traversalInfo)
+#define messageSize(x)   (3 * sizeof(int) +  x * (sizeof(int)+ sizeof(double)) + TRAVERSAL_LENGTH * traversalSize)
+
+#define VOLATILE_PAR 
+#define MASTER_P (processID == 0)
+#define POP_OR_PUT_BYTES(bufPtr, elem, type) (MASTER_P ? (bufPtr = addBytes((bufPtr), &(elem), sizeof(type))) : (bufPtr = popBytes((bufPtr), &(elem), sizeof(type))))
+
+#define ASSIGN_INT(x,y) (MPI_Bcast(&y,1,MPI_INT,0,MPI_COMM_WORLD),DEBUG_PRINT("\tSEND/RECV %d\n", y)) 
+#define ASSIGN_BUF(x,y,type) (POP_OR_PUT_BYTES(bufPtr, y,type))
+#define ASSIGN_BUF_DBL(x,y) (POP_OR_PUT_BYTES(bufPtrDbl,y, double))
+#define ASSIGN_DBL(x,y) (MPI_Bcast(&y,1,MPI_DOUBLE, 0, MPI_COMM_WORLD), DEBUG_PRINT("\tSEND/RECV %f\n", y)) 
+#define ASSIGN_DBLS(tar,src,length) MPI_Bcast(tar, length, MPI_DOUBLE, 0, MPI_COMM_WORLD)
+#define PLL_DOUBLE MPI_DOUBLE
+#define ASSIGN_GATHER(tar,src,length,type,tid) MPI_Gather(src,length,type,tar,length,type,0, MPI_COMM_WORLD)
+#define SEND_BUF(buf, bufSize,type) if(MASTER_P) MPI_Bcast(buf, bufSize, type, 0, MPI_COMM_WORLD) 
+#define RECV_BUF(buf, bufSize,type) if(NOT MASTER_P) MPI_Bcast(buf, bufSize, type, 0, MPI_COMM_WORLD) 
+#define BCAST_BUF(buf, bufSize,type,who)  MPI_Bcast(buf, bufSize, type, who,MPI_COMM_WORLD )
+
+
+
+extern int processes; 
+extern int processID; 
+#endif 
+
+/*********************/
+/* PTHREAD SPECIFIC  */
+/*********************/
+#ifdef _USE_PTHREADS
+#if defined (_MSC_VER)
+#include "pthread.h"
+#else
+#include <pthread.h>
+#endif
+#define _REPRODUCIBLE_MPI_OR_PTHREADS
+#define VOLATILE_PAR volatile 
+#define MASTER_P (tid == 0)
+#define ASSIGN_INT(x,y) (x = y)
+#define ASSIGN_BUF(x,y,type) (x = y)
+#define ASSIGN_BUF_DBL(x,y) (x = y)
+#define ASSIGN_DBL(x,y) (x = y)
+#define ASSIGN_DBLS(tar,src,length) memmove(tar, src, length * sizeof(double))
+#define PLL_DOUBLE double 	/* just redefining that to make the source code less confusing */
+#define ASSIGN_GATHER(tar,src,length,type,tid) (memmove((tar) + (tid) * (length) ,src, length * sizeof(type)))
+#define SEND_BUF(buf, bufSize, type) 
+#define RECV_BUF(buf, bufSize, type) 
+#define BCAST_BUF(buf, bufSize,type,who)  
+#define TRAVERSAL_LENGTH 5
+#define messageSize(x) 0
+#endif
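+
+/* Illustration (not part of the upstream header): the ASSIGN_* macros let the
+   same statement serve both parallelization backends.  Under pthreads,
+
+     ASSIGN_BUF(localTree->mxtips, tr->mxtips, int);
+
+   is a plain assignment, whereas under _FINE_GRAIN_MPI it packs the value into
+   the broadcast buffer on the master and pops it from that buffer on the
+   workers (see POP_OR_PUT_BYTES above). */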
+
+
+#endif	/* end include guard  */
diff --git a/pll/globalVariables.h b/pll/globalVariables.h
new file mode 100644
index 0000000..1c76da8
--- /dev/null
+++ b/pll/globalVariables.h
@@ -0,0 +1,170 @@
+/*  RAxML-VI-HPC (version 2.2) a program for sequential and parallel estimation of phylogenetic trees 
+ *  Copyright August 2006 by Alexandros Stamatakis
+ *
+ *  Partially derived from
+ *  fastDNAml, a program for estimation of phylogenetic trees from sequences by Gary J. Olsen
+ *  
+ *  and 
+ *
+ *  Programs of the PHYLIP package by Joe Felsenstein.
+ *
+ *  This program is free software; you may redistribute it and/or modify its
+ *  under the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ * 
+ *
+ *  For any other enquiries send an Email to Alexandros Stamatakis
+ *  Alexandros.Stamatakis at epfl.ch
+ *
+ *  When publishing work that is based on the results from RAxML-VI-HPC please cite:
+ *
+ *  Alexandros Stamatakis:"RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models". 
+ *  Bioinformatics 2006; doi: 10.1093/bioinformatics/btl446
+ */
+
+#ifdef GLOBAL_VARIABLES_DEFINITION
+
+
+const char *protModels[PLL_NUM_PROT_MODELS] = {"DAYHOFF", "DCMUT", "JTT", "MTREV", "WAG", "RTREV", "CPREV", "VT", "BLOSUM62", "MTMAM", "LG", "MTART", "MTZOA", "PMB", 
+					   "HIVB", "HIVW", "JTTDCMUT", "FLU", "AUTO", "LG4M", "LG4X", "GTR"};
+
+const char binaryStateNames[2]   = {'0', '1'};  
+
+const char dnaStateNames[4]      = {'A', 'C', 'G', 'T'};
+
+const char protStateNames[20]    = {'A','R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 
+				    'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 
+				    'Y', 'V'};
+
+const char inverseMeaningBINARY[4] = {'_', '0', '1', '-'};
+const char inverseMeaningDNA[16]   = {'_', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', '-'};
+const char inverseMeaningPROT[23]  = {'A','R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 
+			       'T', 'W', 'Y', 'V', 'B', 'Z', '-'};
+const char inverseMeaningGeneric32[33] = {'0', '1', '2', '3', '4', '5', '6', '7', 
+				    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+				    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
+				    'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+				    '-'};
+const char inverseMeaningGeneric64[33] = {'0', '1', '2', '3', '4', '5', '6', '7', 
+				    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+				    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
+				    'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+				    '-'};
+
+const unsigned int bitVectorIdentity[256] = {0 ,1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,11 ,12 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,21 ,22 ,23 ,24 ,25 ,26 ,
+					     27 ,28 ,29 ,30 ,31 ,32 ,33 ,34 ,35 ,36 ,37 ,38 ,39 ,40 ,41 ,42 ,43 ,44 ,45 ,46 ,47 ,48 ,49 ,50 ,51 ,
+					     52 ,53 ,54 ,55 ,56 ,57 ,58 ,59 ,60 ,61 ,62 ,63 ,64 ,65 ,66 ,67 ,68 ,69 ,70 ,71 ,72 ,73 ,74 ,75 ,76 ,
+					     77 ,78 ,79 ,80 ,81 ,82 ,83 ,84 ,85 ,86 ,87 ,88 ,89 ,90 ,91 ,92 ,93 ,94 ,95 ,96 ,97 ,98 ,99 ,100 ,101 ,
+					     102 ,103 ,104 ,105 ,106 ,107 ,108 ,109 ,110 ,111 ,112 ,113 ,114 ,115 ,116 ,117 ,118 ,119 ,120 ,121 ,122 ,
+					     123 ,124 ,125 ,126 ,127 ,128 ,129 ,130 ,131 ,132 ,133 ,134 ,135 ,136 ,137 ,138 ,139 ,140 ,141 ,142 ,143 ,
+					     144 ,145 ,146 ,147 ,148 ,149 ,150 ,151 ,152 ,153 ,154 ,155 ,156 ,157 ,158 ,159 ,160 ,161 ,162 ,163 ,164 ,
+					     165 ,166 ,167 ,168 ,169 ,170 ,171 ,172 ,173 ,174 ,175 ,176 ,177 ,178 ,179 ,180 ,181 ,182 ,183 ,184 ,185 ,
+					     186 ,187 ,188 ,189 ,190 ,191 ,192 ,193 ,194 ,195 ,196 ,197 ,198 ,199 ,200 ,201 ,202 ,203 ,204 ,205 ,206 ,
+					     207 ,208 ,209 ,210 ,211 ,212 ,213 ,214 ,215 ,216 ,217 ,218 ,219 ,220 ,221 ,222 ,223 ,224 ,225 ,226 ,227 ,
+					     228 ,229 ,230 ,231 ,232 ,233 ,234 ,235 ,236 ,237 ,238 ,239 ,240 ,241 ,242 ,243 ,244 ,245 ,246 ,247 ,248 ,
+					     249 ,250 ,251 ,252 ,253 ,254 ,255};
+
+
+
+const unsigned int bitVectorAA[23] = {1, 2, 4, 8, 16, 32, 64, 128, 
+				      256, 512, 1024, 2048, 4096, 
+				      8192, 16384, 32768, 65536, 131072, 262144, 
+				      524288, 12 /* N | D */, 96 /*Q | E*/, 1048575 /* - */};
+
+const unsigned int bitVectorSecondary[256] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
+					      10, 11, 12, 13, 14, 15, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 
+					      208, 224, 240, 0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 
+					      255, 0, 256, 512, 768, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 
+					      3584, 3840, 0, 257, 514, 771, 1028, 1285, 1542, 1799, 2056, 2313, 2570, 2827, 3084, 
+					      3341, 3598, 3855, 0, 272, 544, 816, 1088, 1360, 1632, 1904, 2176, 2448, 2720, 2992, 
+					      3264, 3536, 3808, 4080, 0, 273, 546, 819, 1092, 1365, 1638, 1911, 2184, 2457, 2730, 
+					      3003, 3276, 3549, 3822, 4095, 0, 4096, 8192, 12288, 16384, 20480, 24576, 28672, 32768, 
+					      36864, 40960, 45056, 49152, 53248, 57344, 61440, 0, 4097, 8194, 12291, 16388, 20485, 24582, 
+					      28679, 32776, 36873, 40970, 45067, 49164, 53261, 57358, 61455, 0, 4112, 8224, 12336, 16448, 
+					      20560, 24672, 28784, 32896, 37008, 41120, 45232, 49344, 53456, 57568, 61680, 0, 4113, 8226, 
+					      12339, 16452, 20565, 24678, 28791, 32904, 37017, 41130, 45243, 49356, 53469, 57582, 61695, 
+					      0, 4352, 8704, 13056, 17408, 21760, 26112, 30464, 34816, 39168, 43520, 47872, 52224, 56576, 
+					      60928, 65280, 0, 4353, 8706, 13059, 17412, 21765, 26118, 30471, 34824, 39177, 43530, 47883, 
+					      52236, 56589, 60942, 65295, 0, 4368, 8736, 13104, 17472, 21840, 26208, 30576, 34944, 39312, 
+					      43680, 48048, 52416, 56784, 61152, 65520, 0, 4369, 8738, 13107, 17476, 21845, 26214, 30583, 
+					      34952, 39321, 43690, 48059, 52428, 56797, 61166, 65535};
+
+const unsigned int bitVector32[33] = {1,     2,    4,    8,   16,   32,    64,   128,
+                                      256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
+                                      65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608,
+                                      16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648u, 
+				      4294967295u};
+
+/*const unsigned int bitVector64[65] = {};*/
+/** @brief Array for setting bits 0 .. 31 in a bit vector, used in saveMemory technique for the gapVector */
+const unsigned int mask32[32] = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 
+					262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 
+					268435456, 536870912, 1073741824, 2147483648U};
+
+const char *secondaryModelList[21] = { "S6A (GTR)", "S6B", "S6C", "S6D", "S6E", "S7A (GTR)", "S7B", "S7C", "S7D", "S7E", "S7F", "S16 (GTR)", "S16A", "S16B", "S16C", 
+				       "S16D", "S16E", "S16F", "S16I", "S16J", "S16K"};
+
+const partitionLengths pLengths[PLL_MAX_MODEL] = {
+  
+  /* BINARY */
+  {4,   4,   2,  4,  4, 1, 2,  8, 2, 2, PLL_FALSE, PLL_FALSE, 3, inverseMeaningBINARY, 2, PLL_FALSE, bitVectorIdentity},
+  
+  /* DNA */
+  {16,  16,  4, 16, 16, 6, 4, 64, 6, 4, PLL_FALSE, PLL_FALSE, 15, inverseMeaningDNA, 4, PLL_FALSE, bitVectorIdentity},
+        
+  /* AA */
+  {400, 400, 20, 400, 400, 190, 20, 460, 190, 20, PLL_FALSE, PLL_FALSE, 22, inverseMeaningPROT, 20, PLL_TRUE, bitVectorAA},
+  
+  /* SECONDARY_DATA */
+
+  {256, 256, 16, 256, 256, 120, 16, 4096, 120, 16, PLL_FALSE, PLL_FALSE, 255, (char*)NULL, 16, PLL_TRUE, bitVectorSecondary},
+
+  
+  /* SECONDARY_DATA_6 */
+  {36, 36,  6, 36, 36, 15, 6, 384, 15, 6, PLL_FALSE, PLL_FALSE, 63, (char*)NULL, 6, PLL_TRUE, bitVectorIdentity},
+
+  
+  /* SECONDARY_DATA_7 */
+  {49,   49,    7,   49, 49,  21, 7, 896, 21, 7, PLL_FALSE, PLL_FALSE, 127, (char*)NULL, 7, PLL_TRUE, bitVectorIdentity},
+
+  /* 32 states */
+  {1024, 1024, 32, 1024, 1024, 496, 32, 1056, 496, 32, PLL_FALSE, PLL_FALSE, 32, inverseMeaningGeneric32, 32, PLL_TRUE, bitVector32},
+  
+  /* 64 states */
+  {4096, 4096, 64, 4096, 4096, 2016, 64, 4160, 64, 2016, PLL_FALSE, PLL_FALSE, 64, (char*)NULL, 64, PLL_TRUE, (unsigned int*)NULL}
+};
+
+
+#if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
+double *globalResult;
+pllBoolean treeIsInitialized;
+#ifdef MEASURE_TIME_PARALLEL
+double masterTimePerPhase; 
+#endif
+#endif
+
+#ifdef _USE_PTHREADS
+volatile int             jobCycle = 0;
+volatile int             threadJob = 0;
+volatile char            *barrierBuffer;
+#endif
+
+#ifdef _FINE_GRAIN_MPI
+int processes;
+int processID; 
+MPI_Datatype TRAVERSAL_MPI; 
+#endif
+
+#else
+extern const partitionLengths pLengths[PLL_MAX_MODEL];
+extern const char * protModels[PLL_NUM_PROT_MODELS];
+extern char * secondaryModelList[21];
+//extern const unsigned int * mask32;
+
+#endif
diff --git a/pll/hardware.c b/pll/hardware.c
new file mode 100644
index 0000000..3607568
--- /dev/null
+++ b/pll/hardware.c
@@ -0,0 +1,165 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include <string.h>
+#include "hardware.h"
+
+#define PLL_FEAT_AVAIL(x,y) (((x) & (y)) == (y))
+#define PLL_SYS_CPU_DIR_PATH "/sys/devices/system/cpu/"
+
+//#ifdef _MSC_VER
+//#define inline __inline
+//#endif
+
+static __inline void cpuid(unsigned int op, int count,
+                         unsigned int *eax, unsigned int *ebx,
+                         unsigned int *ecx, unsigned int *edx)
+{
+#ifdef WIN32
+	__int32 regs[4];
+	__cpuid((int*)regs, (int)op);
+	*eax = regs[0];
+	*ebx = regs[1];
+	*ecx = regs[2];
+	*edx = regs[3];
+#else
+	*eax = op;
+  *ecx = count;
+  asm volatile("cpuid"
+        : "=a" (*eax),
+          "=b" (*ebx),
+          "=c" (*ecx),
+          "=d" (*edx)
+
+        : "0" (*eax), "2" (*ecx)
+        : "memory");
+#endif
+}
+
+
+void show_hardware_info(pllHardwareInfo * hw)
+{
+  printf ("MMX.........: %d\n"
+          "SSE.........: %d\n"
+          "SSE2........: %d\n"
+          "SSE3........: %d\n"
+          "SSSE3.......: %d\n"
+          "FMA.........: %d\n"
+          "SSE4.1......: %d\n"
+          "SSE4.2......: %d\n"
+          "AVX.........: %d\n"
+          "AVX2........: %d\n"
+          "SSE4A.......: %d\n"
+          "FMA4........: %d\n\n"
+          "Core(s).....: %d\n"
+          "CPU Sockets.: %d\n",
+
+          hw->has_mmx, hw->has_sse, hw->has_sse2, hw->has_sse3, hw->has_ssse3,
+          hw->has_fma, hw->has_sse41, hw->has_sse42, hw->has_avx, hw->has_avx2,
+          hw->has_sse4a, hw->has_fma4, hw->cores, hw->cpu_sockets);
+}
+
+static int pll_probe_cpu (pllHardwareInfo * hw)
+{
+  struct stat cpustat;
+  char cpu[30];
+  char cpupath[100];
+  int i, id, max_physical_id = -1;
+  char * physical_id_path = "/topology/physical_package_id";
+  FILE * fd;
+
+  /* check whether the sys cpu dir exists */
+  if (stat(PLL_SYS_CPU_DIR_PATH, &cpustat)) return (0);
+  
+  /* and also check whether it is a dir */
+  if (!S_ISDIR(cpustat.st_mode)) return (0);
+
+  /* detect number of processors */
+  for (i = 0; ; ++i)
+   {
+     sprintf(cpu, "cpu%d", i);
+     strcpy (cpupath, PLL_SYS_CPU_DIR_PATH);
+     strcat (cpupath, cpu);
+     if (stat(cpupath, &cpustat)) break;
+
+     strcat (cpupath, physical_id_path);
+     if (!stat(cpupath, &cpustat))
+      {
+        fd = fopen (cpupath,"r");
+        fscanf (fd, "%d", &id);
+        /* printf ("Detected processor %d belonging to package %d\n", i, id); */
+        if (id > max_physical_id) max_physical_id = id;
+        fclose (fd);
+      }
+   }
+  
+  hw->cores       = i;
+  hw->cpu_sockets = max_physical_id + 1;
+
+  return (1);
+}
+
+static void pll_probe_hardware (pllHardwareInfo * hw)
+{
+  unsigned int a, b, c, d;
+  c = 0;
+
+  cpuid(0,0,&a,&b,&c,&d);
+  *((unsigned int *)(hw->vendor)    ) = b;
+  *((unsigned int *)(hw->vendor + 4)) = d;
+  *((unsigned int *)(hw->vendor + 8)) = c;
+  hw->vendor[12] = 0;
+
+  printf ("%s\n", hw->vendor);
+
+  cpuid(1,0,&a,&b,&c,&d);
+
+  hw->has_mmx   = PLL_FEAT_AVAIL(d,PLL_HAS_MMX); 
+  hw->has_sse   = PLL_FEAT_AVAIL(d,PLL_HAS_SSE);
+  hw->has_sse2  = PLL_FEAT_AVAIL(d,PLL_HAS_SSE2);
+
+  hw->has_sse3  = PLL_FEAT_AVAIL(c,PLL_HAS_SSE3);
+  hw->has_ssse3 = PLL_FEAT_AVAIL(c,PLL_HAS_SSSE3);
+  hw->has_fma   = PLL_FEAT_AVAIL(c,PLL_HAS_FMA);
+  hw->has_sse41 = PLL_FEAT_AVAIL(c,PLL_HAS_SSE41);
+  hw->has_sse42 = PLL_FEAT_AVAIL(c,PLL_HAS_SSE42);
+  hw->has_avx   = PLL_FEAT_AVAIL(c,PLL_HAS_AVX);
+
+  cpuid(7,0,&a,&b,&c,&d);
+
+  hw->has_avx2  = PLL_FEAT_AVAIL(b,PLL_HAS_AVX2);
+
+  /* TODO: note, here we have to check whether leaf 0x80000001 exists */
+  cpuid(0x80000001,0,&a,&b,&c,&d);
+
+  hw->has_sse4a = PLL_FEAT_AVAIL(c,PLL_HAS_SSE4A);
+  hw->has_fma4  = PLL_FEAT_AVAIL(c,PLL_HAS_FMA4);
+}
+
+int pllGetHardwareInfo (pllHardwareInfo * hw)
+{
+  pll_probe_hardware (hw);
+  pll_probe_cpu (hw);
+
+  /* TODO: finish failure checks in probe_hardware and probe_cpu */
+  return (1);
+
+}
+
+/* TODO: Remove after testing */
+/* 
+int main (int argc, char * argv[])
+{ 
+  pllHardwareInfo hw;
+
+  pll_probe_hardware(&hw);
+  pll_probe_cpu(&hw);
+
+  show_hardware_info(&hw);
+  return (EXIT_SUCCESS);
+}
+*/
diff --git a/pll/hardware.h b/pll/hardware.h
new file mode 100644
index 0000000..d1bfa33
--- /dev/null
+++ b/pll/hardware.h
@@ -0,0 +1,48 @@
+#ifndef PLL_HARDWARE
+#define PLL_HARDWARE
+
+/* leaf 1 */
+/* edx */
+#define PLL_HAS_MMX             1 << 23
+#define PLL_HAS_SSE             1 << 25
+#define PLL_HAS_SSE2            1 << 26
+
+/* ecx */
+#define PLL_HAS_SSE3            1
+#define PLL_HAS_SSSE3           1 <<  9
+#define PLL_HAS_FMA             1 << 12
+#define PLL_HAS_SSE41           1 << 19
+#define PLL_HAS_SSE42           1 << 20
+#define PLL_HAS_AVX             1 << 28
+
+
+/* leaf 7 */
+/* ebx */
+#define PLL_HAS_AVX2            1 <<  5
+
+/* leaf 0x80000001 */
+/* ecx*/
+#define PLL_HAS_SSE4A           1 <<  6
+#define PLL_HAS_FMA4            1 << 16
+
+typedef struct
+{
+  int has_mmx;
+  int has_sse;
+  int has_sse2;
+  int has_sse3;
+  int has_ssse3;
+  int has_sse41;
+  int has_sse42;
+  int has_sse4a;
+  int has_avx;
+  int has_avx2;
+  int has_fma;
+  int has_fma4;
+  int cpu_sockets;
+  int cores;
+  char vendor[13];
+
+} pllHardwareInfo;
+
+#endif
diff --git a/pll/hash.c b/pll/hash.c
new file mode 100644
index 0000000..4a68225
--- /dev/null
+++ b/pll/hash.c
@@ -0,0 +1,219 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file hash.c
+ */
+#include <stdio.h>
+#include <string.h>
+#include "pll.h"
+#include "mem_alloc.h"
+
+static const unsigned int initTable[] = 
+  {
+    53,         97,         193,       389,       769,    
+    1543,       3079,       6151,      12289,     24593, 
+    49157,      98317,      196613,    393241,    786433, 
+    1572869,    3145739,    6291469,   12582917,  25165843, 
+    50331653,   100663319,  201326611, 402653189, 805306457, 
+    1610612741, 3221225473, 4294967291
+  };
+       
+/** @brief Generate the hash value for a string 
+
+    Generates the hash value of a string \a s.
+
+    @param s     The string to compute the hash for
+    @param size  Size of the hash table
+    @return      Hash of string \a s, i.e. an index in the hash table
+*/
+unsigned int pllHashString (const char * s, unsigned int size)
+{
+  unsigned int hash = 0;
+
+  for (; *s; ++s) hash = (hash << 5) - hash + (unsigned int )*s;
+
+  return (hash % size);
+}
+
+/** @brief Add a string and its data to a hashtable
+    
+    Add an \a item and possibly a string \a s to hashtable \a hTable at position
+    \a hash, where \a hash must be a value between 0 and \a hTable->size - 1. If
+    string \a s is given and another record with the same computed hash and the
+    same associated string exists in the hash table, then the new record will \b not be added and the
+    value \b PLL_FALSE is returned. Otherwise, the new item is added at the
+    beginning of the corresponding linked list and the value \b PLL_TRUE is
+    returned.
+
+    @param hTable Hashtable
+    @param hash   Position where to store in hash table
+    @param s      String
+    @param item   Data associated with \a s
+    @return       Returns \b PLL_TRUE if added with success, otherwise \b PLL_FALSE
+*/
+int pllHashAdd  (pllHashTable * hTable, unsigned int hash, const char * s, void * item)
+{
+  pllHashItem * hItem;
+
+  hItem = hTable->Items[hash];
+
+  /* If a string was given, check whether the record already exists */
+  if (s)
+   {
+     for (; hItem; hItem = hItem->next)
+      {
+        if (hItem->str && !strcmp (s, hItem->str)) return (PLL_FALSE);
+      }
+   }
+
+  hItem = (pllHashItem *) rax_malloc (sizeof (pllHashItem));
+
+  /* store the string together with the element if given */
+  if (s)
+   {
+     hItem->str = (char *) rax_malloc ((strlen(s) + 1) * sizeof (char));
+     strcpy (hItem->str, s);
+   }
+  else
+   hItem->str = NULL;
+
+  hItem->data = item;
+
+  hItem->next = hTable->Items[hash];
+  hTable->Items[hash] = hItem;
+  hTable->entries += 1;
+
+  return (PLL_TRUE);
+}
+
+       
+/** @brief Initialize hash table
+    
+    Create a hash table of size at least \a n. The size of the hash table will
+    be the first prime number higher or equal to \a n.
+
+    @param n  Minimum size of hash table
+    @return   In case of success, returns a pointer to the created hash table, otherwise returns \b NULL
+*/
+pllHashTable * pllHashInit (unsigned int n)
+{ 
+  pllHashTable * hTable;
+  unsigned int i;
+  unsigned int primeTableLength;
+       
+  hTable = (pllHashTable *) rax_malloc (sizeof (pllHashTable));
+  if (!hTable) return (NULL);
+  
+  primeTableLength = sizeof (initTable) / sizeof(initTable[0]);
+
+  i = 0;
+ 
+  while (initTable[i] < n && i < primeTableLength) ++ i;
+ 
+  n = initTable[i];  
+ 
+  hTable->Items = (pllHashItem **) rax_calloc (n, sizeof (pllHashItem *));
+  if (!hTable->Items)
+   {
+     rax_free (hTable);
+     return (NULL);
+   }
+  hTable->size    = n;
+  hTable->entries = 0;
+ 
+  return (hTable);
+}
+
+/** @brief Retrieve the data stored in hash table for a given string
+
+    Retrieve the data stored in hash table \a hTable under a given string \a s.
+    In case the string is found in the hash table, the associated data are
+    stored in \a item and the function returns \b PLL_TRUE. In the opposite
+    case, or if \a s is given as \b NULL, then \b PLL_FALSE is returned.
+
+    @param hTable   Hash table to be searched
+    @param s        String to look for
+    @param item     Where to store the retrieved data
+    @return         Returns \b PLL_TRUE if the string was found, otherwise \b PLL_FALSE
+*/
+int pllHashSearch (pllHashTable * hTable, char * s, void ** item)
+{
+  unsigned int pos;
+  pllHashItem * hItem;
+
+  if (!s) return (PLL_FALSE);
+
+  pos   = pllHashString (s, hTable->size);
+  hItem = hTable->Items[pos];
+
+  for (; hItem; hItem = hItem->next)
+   {
+     if (hItem->str && !strcmp (s, hItem->str))
+      {
+        *item = hItem->data;
+        return (PLL_TRUE);
+      }
+   }
+
+  return (PLL_FALSE);
+}
+
+/** @brief Deallocate a hash table
+
+    Deallocates the hash table. A callback function may be specified as \a
+    cbDealloc which will be executed upon all \a data elements of the hash
+    table, for deallocating custom data. If no deallocation is required for the
+    custom data, then \a cbDealloc must be set to \b NULL. The strings
+    associated with each hash element are deallocated.
+
+    @param hTable    Hash table to be deallocated
+    @param cbDealloc Callback function to perform deallocation of each data element of the hash table
+    @note
+      Deallocates the structure of the hash table. Note that the 
+      data associated with the indexed strings are not deallocated unless \a cbDealloc is provided.
+*/
+void pllHashDestroy (pllHashTable ** hTable, void (*cbDealloc)(void *))
+{
+  unsigned int i;
+  pllHashItem * hItem;
+  pllHashItem * tmp;
+
+  for (i = 0; i < (*hTable)->size; ++ i)
+  {
+    hItem = (*hTable)->Items[i];
+    while (hItem)
+     {
+       tmp   = hItem;
+       hItem = hItem->next;
+       if (tmp->str)  rax_free (tmp->str);
+       if (cbDealloc) cbDealloc (tmp->data);
+       rax_free (tmp);
+     }
+  }
+  rax_free ((*hTable)->Items);
+  rax_free (*hTable);
+  *hTable = NULL;
+}
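+
+/* Usage sketch (illustration only; the taxon name and record pointer are
+   placeholders chosen for the example):
+
+   pllHashTable * ht = pllHashInit (100);
+   unsigned int   h  = pllHashString ("Homo_sapiens", ht->size);
+   void         * record;
+
+   pllHashAdd (ht, h, "Homo_sapiens", (void *) someNodePtr);
+
+   if (pllHashSearch (ht, "Homo_sapiens", &record))
+     {
+       ... use record ...
+     }
+
+   pllHashDestroy (&ht, NULL);
+*/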
diff --git a/pll/hash.h b/pll/hash.h
new file mode 100644
index 0000000..a550f38
--- /dev/null
+++ b/pll/hash.h
@@ -0,0 +1,50 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file hash.h
+ */
+#ifndef __pll_HASH__
+#define __pll_HASH__
+
+struct pllHashItem
+{
+  void * data;
+  char * str;
+  struct pllHashItem * next;
+};
+
+struct pllHashTable
+{
+  unsigned int size;
+  struct pllHashItem ** Items;
+};
+
+unsigned int pllHashString (const char * s, unsigned int size);
+int pllHashAdd  (struct pllHashTable * hTable, const char * s, void * item);
+struct pllHashTable * pllHashInit (unsigned int n);
+int pllHashSearch (struct pllHashTable * hTable, char * s, void ** item);
+void pllHashDestroy (struct pllHashTable ** hTable, int);
+#endif
diff --git a/pll/lexer.c b/pll/lexer.c
new file mode 100644
index 0000000..1cbf614
--- /dev/null
+++ b/pll/lexer.c
@@ -0,0 +1,299 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file lexer.c
+ */
+#include <stdio.h>
+#include "lexer.h"
+
+static const char * rawtext;
+static long rawtext_size;
+static long pos = 0;
+
+int lex_table[PLL_ASCII_SIZE] = {
+/*      */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN,     PLL_SYM_TAB,      PLL_SYM_CR,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN,      PLL_SYM_LF, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*  !"# */   PLL_SYM_SPACE, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/* $%&' */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/* ()*+ */  PLL_SYM_OPAREN,  PLL_SYM_CPAREN, PLL_SYM_UNKNOWN,      PLL_SYM_PLUS,
+/* ,-./ */   PLL_SYM_COMMA,    PLL_SYM_DASH,     PLL_SYM_DOT,     PLL_SYM_SLASH,
+/* 0123 */   PLL_SYM_DIGIT,   PLL_SYM_DIGIT,   PLL_SYM_DIGIT,     PLL_SYM_DIGIT,
+/* 4567 */   PLL_SYM_DIGIT,   PLL_SYM_DIGIT,   PLL_SYM_DIGIT,     PLL_SYM_DIGIT,
+/* 89:; */   PLL_SYM_DIGIT,   PLL_SYM_DIGIT,   PLL_SYM_COLON, PLL_SYM_SEMICOLON,
+/* <=>? */ PLL_SYM_UNKNOWN,   PLL_SYM_EQUAL, PLL_SYM_UNKNOWN,      PLL_SYM_CHAR,
+/* @ABC */ PLL_SYM_UNKNOWN,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* DEFG */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* HIJK */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* LMNO */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* PQRS */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* TUVW */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* XYZ[ */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,   PLL_SYM_UNKNOWN,
+/* \]^_ */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,      PLL_SYM_CHAR,
+/* `abc */ PLL_SYM_UNKNOWN,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* defg */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* hijk */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* lmno */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* pqrs */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* tuvw */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* xyz{ */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,   PLL_SYM_UNKNOWN,
+/* |}~  */    PLL_SYM_CHAR, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN
+ };
+
+int 
+get_next_byte (void)
+{
+  if (pos == rawtext_size) 
+   {
+     ++pos;
+     return (PLL_EOS);
+   }
+
+  return (rawtext[pos++]);
+}
+
+int
+get_next_symbol (void)
+{
+  int ch, sym;
+
+  ch = get_next_byte ();
+
+  if (ch == PLL_EOS) return (PLL_SYM_EOF);
+  if (ch >= PLL_ASCII_SIZE) return (PLL_SYM_UNKNOWN);
+
+  sym = lex_table[ch];
+
+  if (sym == PLL_SYM_LF)
+   {
+     if (get_next_byte() == '\n')
+      {
+        sym = PLL_SYM_LFCR;
+      }
+     else
+      {
+        --pos;
+      }
+   }
+
+  return sym;
+}
+
+pllLexToken
+get_token (int * input)
+{
+  pllLexToken token;
+  int
+    start_pos,
+    isFloating = 0;
+
+  token.lexeme = rawtext + pos - 1;
+  start_pos    = pos;
+
+  switch (*input)
+   {
+     case PLL_SYM_SLASH:
+       token.tokenType = PLL_TOKEN_SLASH;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_DASH:
+       token.tokenType = PLL_TOKEN_DASH;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_EQUAL:
+       token.tokenType = PLL_TOKEN_EQUAL;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_SEMICOLON:
+       token.tokenType = PLL_TOKEN_SEMICOLON;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_COMMA:
+       token.tokenType = PLL_TOKEN_COMMA;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_COLON:
+       token.tokenType = PLL_TOKEN_COLON;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_OPAREN:
+       token.tokenType = PLL_TOKEN_OPAREN;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_CPAREN:
+       token.tokenType = PLL_TOKEN_CPAREN;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_SPACE:
+     case PLL_SYM_TAB:
+       do
+        {
+          *input = get_next_symbol();
+        } while (*input == PLL_SYM_SPACE || *input == PLL_SYM_TAB);
+       token.len   = pos - start_pos;
+       token.tokenType = PLL_TOKEN_WHITESPACE; 
+       if (*input == PLL_SYM_LFCR) --token.len;
+       break;
+       
+     case PLL_SYM_DIGIT:
+       do
+        {
+          *input = get_next_symbol();   
+        } while (*input == PLL_SYM_DIGIT);
+
+       if (*input == PLL_SYM_DOT)
+        {
+          isFloating = 1;
+          do
+           {
+             *input = get_next_symbol ();
+           } while (*input == PLL_SYM_DIGIT);
+        }
+
+       if (*input != PLL_SYM_CHAR)
+        {
+          token.len   = pos - start_pos;
+          if (!isFloating)
+            token.tokenType = PLL_TOKEN_NUMBER;
+          else
+            token.tokenType = PLL_TOKEN_FLOAT;
+        }
+       else
+        {
+          /* check for E notation */
+          if (rawtext[pos - 1] == 'E' || rawtext[pos - 1] == 'e')
+           {
+             *input = get_next_symbol ();
+
+             if (*input == PLL_SYM_PLUS || *input == PLL_SYM_DASH || *input == PLL_SYM_DIGIT)
+              {
+                do
+                 {
+                   *input = get_next_symbol ();
+                 } while (*input == PLL_SYM_DIGIT);
+
+                if (*input != PLL_SYM_CHAR)
+                 {
+                   token.len = pos - start_pos;
+                   token.tokenType = PLL_TOKEN_FLOAT;
+                 }
+              }
+             else
+              {
+                token.len = pos - start_pos;
+                token.tokenType = PLL_TOKEN_STRING;
+              }
+           }
+
+          if (*input == PLL_SYM_CHAR)
+           {
+             do {
+               *input = get_next_symbol();
+             } while (*input == PLL_SYM_CHAR || *input == PLL_SYM_DIGIT || *input == PLL_SYM_DOT);
+             token.len   = pos - start_pos;
+             token.tokenType = PLL_TOKEN_STRING;
+           }
+        }
+
+       if (*input == PLL_SYM_LFCR) --token.len;
+       break;
+
+     case PLL_SYM_CHAR:
+       do
+        {
+          *input = get_next_symbol();
+        } 
+       while (*input == PLL_SYM_CHAR  || 
+              *input == PLL_SYM_DIGIT || 
+              *input == PLL_SYM_DASH  ||
+              *input == PLL_SYM_DOT);
+       token.len   = pos - start_pos;
+       token.tokenType = PLL_TOKEN_STRING;
+       if (*input == PLL_SYM_LFCR) --token.len;
+       break;
+       
+     case PLL_SYM_EOF:
+       token.tokenType = PLL_TOKEN_EOF;
+       break;
+
+     case PLL_SYM_CR:
+     case PLL_SYM_LF:
+     case PLL_SYM_LFCR:
+       do
+        {
+          *input = get_next_symbol();
+        } while (*input == PLL_SYM_CR || *input == PLL_SYM_LFCR || *input == PLL_SYM_LF);
+       token.tokenType = PLL_TOKEN_NEWLINE;
+       break;
+     case PLL_SYM_UNKNOWN:
+     default:
+       token.tokenType = PLL_TOKEN_UNKNOWN;
+       break;
+   }
+
+  return (token);
+}
+
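+/* The helpers below tweak the lexer table per input format: in PHYLIP and
+   FASTA alignments '-' and '.' appear inside sequences (e.g. as gap
+   characters), and '>' starts a FASTA header line, so these must be lexed as
+   PLL_SYM_CHAR rather than as operators; lex_table_restore() reverts to the
+   default classification. */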
+void
+lex_table_amend_phylip (void)
+{
+  lex_table['-'] = lex_table['.'] = PLL_SYM_CHAR; 
+}
+
+void
+lex_table_amend_fasta (void)
+{
+  lex_table['-'] = lex_table['.'] = lex_table['>'] = PLL_SYM_CHAR; 
+}
+
+void
+lex_table_restore (void)
+{
+  lex_table['-'] = PLL_SYM_DASH;
+  lex_table['.'] = PLL_SYM_DOT; 
+  lex_table['>'] = PLL_SYM_UNKNOWN;
+}
+
+void
+init_lexan (const char * text, long n)
+{
+  rawtext      = text;
+  rawtext_size = n;
+  pos          = 0;
+}
diff --git a/pll/lexer.h b/pll/lexer.h
new file mode 100644
index 0000000..6924259
--- /dev/null
+++ b/pll/lexer.h
@@ -0,0 +1,88 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file lexer.h
+ */
+#ifndef __pll_LEXER__
+#define __pll_LEXER__
+
+#define  PLL_ASCII_SIZE                128
+#define  PLL_EOS                       0x00000200
+
+#define  PLL_SYM_CR                    1 << 0
+#define  PLL_SYM_LF                    1 << 1
+#define  PLL_SYM_LFCR                  1 << 2
+#define  PLL_SYM_DIGIT                 1 << 3
+#define  PLL_SYM_CHAR                  1 << 4
+#define  PLL_SYM_SPACE                 1 << 5
+#define  PLL_SYM_TAB                   1 << 6
+#define  PLL_SYM_EOF                   1 << 7
+#define  PLL_SYM_UNKNOWN               1 << 8
+#define  PLL_SYM_DOT                   1 << 9
+#define  PLL_SYM_COLON                 1 << 10
+#define  PLL_SYM_OPAREN                1 << 11
+#define  PLL_SYM_CPAREN                1 << 12
+#define  PLL_SYM_COMMA                 1 << 13
+#define  PLL_SYM_SEMICOLON             1 << 14
+#define  PLL_SYM_EQUAL                 1 << 15
+#define  PLL_SYM_DASH                  1 << 16
+#define  PLL_SYM_SLASH                 1 << 17
+#define  PLL_SYM_PLUS                  1 << 18
+
+#define  PLL_TOKEN_NUMBER              1 << 0
+#define  PLL_TOKEN_STRING              1 << 1
+#define  PLL_TOKEN_EOF                 1 << 2
+#define  PLL_TOKEN_WHITESPACE          1 << 3
+#define  PLL_TOKEN_NEWLINE             1 << 4
+#define  PLL_TOKEN_UNKNOWN             1 << 5
+#define  PLL_TOKEN_COLON               1 << 6
+#define  PLL_TOKEN_OPAREN              1 << 7
+#define  PLL_TOKEN_CPAREN              1 << 8
+#define  PLL_TOKEN_FLOAT               1 << 9
+#define  PLL_TOKEN_COMMA               1 << 10
+#define  PLL_TOKEN_SEMICOLON           1 << 11
+#define  PLL_TOKEN_EQUAL               1 << 12
+#define  PLL_TOKEN_DASH                1 << 13
+#define  PLL_TOKEN_SLASH               1 << 14
+
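+/* Both the PLL_SYM_* and PLL_TOKEN_* codes are one-hot bit flags, so several
+   of them can be OR-ed together and tested with a single bitwise AND, as the
+   CONSUME() macro below does. */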
+#define CONSUME(x)         while (token.tokenType & (x)) token = get_token (&input);
+#define NEXT_TOKEN         token = get_token (&input);
+
+typedef struct
+ {
+   int 	        tokenType;
+   const char * lexeme;
+   int          len;
+ } pllLexToken;
+
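+/* Typical calling pattern (sketch; the caller declares "pllLexToken token;"
+ * and "int input;" itself):
+ *
+ *   init_lexan (rawtext, n);
+ *   input = get_next_symbol ();
+ *   NEXT_TOKEN                                    (token = get_token (&input))
+ *   CONSUME (PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+ */
+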
+int get_next_byte (void);
+int get_next_symbol (void);
+pllLexToken get_token (int * input);
+void init_lexan (const char * text, long n);
+void lex_table_amend_phylip (void);
+void lex_table_amend_fasta (void);
+void lex_table_restore (void);
+#endif
diff --git a/pll/makenewzGenericSpecial.c b/pll/makenewzGenericSpecial.c
new file mode 100644
index 0000000..b2b114a
--- /dev/null
+++ b/pll/makenewzGenericSpecial.c
@@ -0,0 +1,3145 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file makenewzGenericSpecial.c
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+#ifdef __SSE3
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+/*#include <tmmintrin.h>*/
+#endif
+
+#ifdef __MIC_NATIVE
+#include "mic_native.h"
+#endif
+
+
+/** @file makenewzGenericSpecial.c
+ *  
+ *  @brief Branch length optimization
+ */
+
+
+
+/* pointers to reduction buffers for storing and gathering the first and second derivative 
+   of the likelihood in Pthreads and MPI */
+
+#if IS_PARALLEL
+void branchLength_parallelReduce(pllInstance *tr, double *dlnLdlz,  double *d2lnLdlz2, int numBranches ) ;
+//extern double *globalResult;
+#endif
+
+
+extern const unsigned int mask32[32];
+
+#if (defined(__SSE3) || defined(__AVX))
+static void sumGAMMA_BINARY(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+                            unsigned char *tipX1, unsigned char *tipX2, int n);
+static void coreGTRGAMMA_BINARY(const int upper, double *sumtable,
+                                volatile double *d1,   volatile double *d2, double *EIGN, double *gammaRates, double lz, int *wrptr);
+static void coreGTRCAT_BINARY(int upper, int numberOfCategories, double *sum,
+                              volatile double *d1, volatile double *d2, 
+                              double *rptr, double *EIGN, int *cptr, double lz, int *wgt);
+static void sumCAT_BINARY(int tipCase, double *sum, double *x1_start, double *x2_start, double *tipVector,
+                          unsigned char *tipX1, unsigned char *tipX2, int n);
+#endif
+
+/*******************/
+
+
+/* generic function to get the required pointers to the data associated with the left and right node that define a branch */
+
+static void getVects(pllInstance *tr, 
+                     partitionList *pr, 
+                     unsigned char **tipX1, unsigned char **tipX2, 
+                     double **x1_start, double **x2_start, 
+                     int *tipCase, 
+                     int model, 
+                     double **x1_gapColumn, double **x2_gapColumn, 
+                     unsigned int **x1_gap, unsigned int **x2_gap,
+                     double ** x1_start_asc,
+                     double ** x2_start_asc)
+{
+  int    
+    rateHet = (int)discreteRateCategories(tr->rateHetModel),
+            states = pr->partitionData[model]->states,
+            pNumber, 
+            qNumber; 
+
+  /* get the left and right node number of the nodes defining the branch we want to optimize */
+
+  pNumber = tr->td[0].ti[0].pNumber;
+  qNumber = tr->td[0].ti[0].qNumber;
+
+  /* get the index where the ancestral vector is expected to be found */
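+  /* with ancestral vector recomputation enabled (tr->useRecom) the slot comes from
+     the traversal descriptor; otherwise inner node k always occupies xVector slot
+     k - mxtips - 1 */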
+  int p_slot, q_slot;
+  if(tr->useRecom)
+  {
+    p_slot = tr->td[0].ti[0].slot_p; 
+    q_slot = tr->td[0].ti[0].slot_q;
+  }
+  else
+  {
+    p_slot = pNumber - tr->mxtips - 1;
+    q_slot = qNumber - tr->mxtips - 1;
+  }
+   
+
+  /* initialize to NULL */
+
+  *x1_start = (double*)NULL,
+  *x2_start = (double*)NULL;
+  
+  *tipX1 = (unsigned char*)NULL,
+  *tipX2 = (unsigned char*)NULL;
+
+  *x1_start_asc = NULL;
+  *x2_start_asc = NULL;
+
+  /* switch over the different tip cases again here */
+
+  if(isTip(pNumber, tr->mxtips) || isTip(qNumber, tr->mxtips))
+  {      
+    if(!( isTip(pNumber, tr->mxtips) && isTip(qNumber, tr->mxtips)) )
+    {
+      *tipCase = PLL_TIP_INNER;
+      if(isTip(qNumber, tr->mxtips))
+      {
+        *tipX1 = pr->partitionData[model]->yVector[qNumber];
+        *x2_start = pr->partitionData[model]->xVector[p_slot];
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+        if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+          if(pr->partitionData[model]->ascBias)
+#endif
+          {
+            *x2_start_asc = &pr->partitionData[model]->ascVector[(pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+          }
+
+        if(tr->saveMemory)
+        {
+          *x2_gap = &(pr->partitionData[model]->gapVector[pNumber * pr->partitionData[model]->gapVectorLength]);
+          *x2_gapColumn   = &pr->partitionData[model]->gapColumn[(pNumber - tr->mxtips - 1) * states * rateHet];
+        }
+      }
+      else
+      {
+        *tipX1 = pr->partitionData[model]->yVector[pNumber];
+        *x2_start = pr->partitionData[model]->xVector[q_slot];
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+        if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+          if(pr->partitionData[model]->ascBias)
+#endif  
+          {
+            *x2_start_asc = &pr->partitionData[model]->ascVector[(qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+          }
+
+        if(tr->saveMemory)
+        {
+          *x2_gap = &(pr->partitionData[model]->gapVector[qNumber * pr->partitionData[model]->gapVectorLength]);
+          *x2_gapColumn   = &pr->partitionData[model]->gapColumn[(qNumber - tr->mxtips - 1) * states * rateHet];
+        }
+      }
+    }
+    else
+    {
+      /* note that tip-tip should normally not occur since this means that we are trying to optimize 
+         a branch in a two-taxon tree. However, this has been inherited from some RAxML function 
+         that optimized pair-wise distances between all taxa in a tree */
+
+      *tipCase = PLL_TIP_TIP;
+      *tipX1 = pr->partitionData[model]->yVector[pNumber];
+      *tipX2 = pr->partitionData[model]->yVector[qNumber];
+    }
+  }
+  else
+  {
+    *tipCase = PLL_INNER_INNER;
+
+    *x1_start = pr->partitionData[model]->xVector[p_slot];
+    *x2_start = pr->partitionData[model]->xVector[q_slot];
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+      if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+        if(pr->partitionData[model]->ascBias)
+#endif
+        {
+          *x1_start_asc = &pr->partitionData[model]->ascVector[(pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+          *x2_start_asc = &pr->partitionData[model]->ascVector[(qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+        }           
+    if(tr->saveMemory)
+    {
+      *x1_gap = &(pr->partitionData[model]->gapVector[pNumber * pr->partitionData[model]->gapVectorLength]);
+      *x1_gapColumn   = &pr->partitionData[model]->gapColumn[(pNumber - tr->mxtips - 1) * states * rateHet];
+
+      *x2_gap = &(pr->partitionData[model]->gapVector[qNumber * pr->partitionData[model]->gapVectorLength]);
+      *x2_gapColumn   = &pr->partitionData[model]->gapColumn[(qNumber - tr->mxtips - 1) * states * rateHet];
+    }
+  }
+
+}
+
+
+/* this is actually a pre-computation and storage of values that remain constant while we change the value of the branch length 
+   we want to adapt. The target pointer sumtable is a single pre-allocated array that has the same 
+   size as a conditional likelihood vector at an inner node.
+
+   So if we want to do a Newton-Raphson optimization we only execute this function once in the beginning for each new branch we are considering!
+   */
+
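+/* In eigenvector space the per-site likelihood along the branch factors as
+   (sketch)  L_i(lz) = SUM_l  x1_i[l] * x2_i[l] * exp(EIGN_l * r * lz),
+   so the element-wise products x1_i[l] * x2_i[l] stored in sumtable stay
+   constant while only lz is varied during the Newton-Raphson iterations. */
+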
+#if (!defined(__SSE3) && !defined(__AVX))
+static void sumCAT_FLEX(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, const int states)
+{
+  int 
+    i, 
+    l;
+
+  double 
+    *sum, 
+    *left, 
+    *right;
+
+  switch(tipCase)
+  {
+
+    /* switch over possible configurations of the nodes p and q defining the branch */
+
+    case PLL_TIP_TIP:
+      for (i = 0; i < n; i++)
+      {
+        left  = &(tipVector[states * tipX1[i]]);
+        right = &(tipVector[states * tipX2[i]]);
+        sum = &sumtable[states * i];
+
+        /* just multiply the values with each other for each site; note the similarity with evaluate(): 
+           we precompute the product, which will remain constant, and then just multiply this pre-computed 
+           product with the changing P-matrix exponentials that depend on the branch lengths */
+
+        for(l = 0; l < states; l++)
+          sum[l] = left[l] * right[l];
+      }
+      break;
+    case PLL_TIP_INNER:
+
+      /* same as for PLL_TIP_TIP, only that 
+         we now access one tip vector and one 
+         inner vector. 
+
+         You may also observe that we do not consider using scaling vectors anywhere here.
+
+         This is because we are interested in the first and second derivatives of the likelihood and 
+         hence the addition of the log() of the scaling factor times the number of scaling events
+         becomes obsolete through the derivative */
+
+      for (i = 0; i < n; i++)
+      {
+        left = &(tipVector[states * tipX1[i]]);
+        right = &x2[states * i];
+        sum = &sumtable[states * i];
+
+        for(l = 0; l < states; l++)
+          sum[l] = left[l] * right[l];
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+      {
+        left  = &x1[states * i];
+        right = &x2[states * i];
+        sum = &sumtable[states * i];
+
+        for(l = 0; l < states; l++)
+          sum[l] = left[l] * right[l];
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+#endif
+
+
+
+#if (!defined(__SSE3) && !defined(__AVX))
+
+/* same thing for GAMMA models. The only noteworthy thing here is that we have an additional inner loop over the 
+   number of discrete gamma rates. The data access pattern is also different since for tip vector accesses through our 
+   lookup table, we do not distinguish between rates 
+
+   Note the different access pattern in PLL_TIP_INNER:
+
+   left = &(tipVector[states * tipX1[i]]);        
+   right = &(x2[span * i + l * states]);
+
+*/
+
+static void sumGAMMA_FLEX(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, const int states)
+{
+  int 
+    i, 
+    l, 
+    k;
+
+  const int 
+    span = 4 * states;
+
+  double 
+    *left, 
+    *right, 
+    *sum;
+
+
+
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for(i = 0; i < n; i++)
+      {
+        left  = &(tipVector[states * tipX1[i]]);
+        right = &(tipVector[states * tipX2[i]]);
+
+        for(l = 0; l < 4; l++)
+        {
+          sum = &sumtable[i * span + l * states];
+
+          for(k = 0; k < states; k++)
+            sum[k] = left[k] * right[k];
+
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      //reorder_back( x2, n, span );
+      for(i = 0; i < n; i++)
+      {
+        left = &(tipVector[states * tipX1[i]]);
+
+        for(l = 0; l < 4; l++)
+        {
+          right = &(x2[span * i + l * states]);
+          sum = &sumtable[i * span + l * states];
+
+          for(k = 0; k < states; k++)
+            sum[k] = left[k] * right[k];
+
+        }
+      }
+      //reorder( x2, n, span );
+      break;
+    case PLL_INNER_INNER:
+      //reorder_back( x1, n, span );
+      //reorder_back( x2, n, span );
+      for(i = 0; i < n; i++)
+      {
+        for(l = 0; l < 4; l++)
+        {
+          left  = &(x1[span * i + l * states]);
+          right = &(x2[span * i + l * states]);
+          sum   = &(sumtable[i * span + l * states]);
+
+
+          for(k = 0; k < states; k++)
+            sum[k] = left[k] * right[k];
+        }
+      }
+      //reorder( x1, n, span );
+      //reorder( x2, n, span );
+      break;
+    default:
+      assert(0);
+  }
+}
+#endif
+
+/* optimized functions for branch length optimization */
+
+
+#if (defined(__SSE3) || defined(__AVX))
+
+static void sumCAT_SAVE(int tipCase, double *sum, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+static void sumGAMMA_GAPPED_SAVE(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, 
+    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+static void sumGAMMA(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+static void sumCAT(int tipCase, double *sum, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+static void sumGAMMAPROT_GAPPED_SAVE(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, 
+    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+static void sumGAMMAPROT_LG4(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector[4],
+                             unsigned char *tipX1, unsigned char *tipX2, int n);
+
+static void sumGAMMAPROT(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+static void sumGTRCATPROT(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+static void sumGTRCATPROT_SAVE(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, 
+    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+static void coreGTRGAMMAPROT_LG4(double *gammaRates, double *EIGN[4], double *sumtable, int upper, int *wrptr,
+                                 volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double lz,
+                                 double * lg4_weights);
+
+static void coreGTRGAMMA(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN, double *gammaRates, double lz, int *wrptr);
+
+static void coreGTRCAT(int upper, int numberOfCategories, double *sum,
+    volatile double *d1, volatile double *d2, int *wgt, 
+    double *rptr, double *EIGN, int *cptr, double lz);
+
+
+static void coreGTRGAMMAPROT(double *gammaRates, double *EIGN, double *sumtable, int upper, int *wrptr,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double lz);
+
+static void coreGTRCATPROT(double *EIGN, double lz, int numberOfCategories, double *rptr, int *cptr, int upper,
+    int *wgt, volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *sumtable);
+
+#endif
+
+
+/* now this is the core function of the Newton-Raphson-based branch length optimization that actually computes 
+   the first and second derivatives of the likelihood given a new proposed branch length lz */
+
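+/* Sketch of the math implemented by the core*() functions below: writing the
+   per-site likelihood in eigenvector space as
+
+     L_i(lz) = SUM_l  s_l * exp(EIGN_l * r * lz)       (s_l taken from sumtable)
+
+   gives
+
+     d ln L_i / dlz   = (1 / L_i) * SUM_l s_l * (EIGN_l * r)   * exp(EIGN_l * r * lz)
+     d2 ln L_i / dlz2 = (1 / L_i) * SUM_l s_l * (EIGN_l * r)^2 * exp(EIGN_l * r * lz)
+                        - (d ln L_i / dlz)^2
+
+   which is why the diagptable arrays below store exp(EIGN*r*lz), EIGN*r and
+   (EIGN*r)^2 per state (the factor r is applied in the final per-site weighting
+   in the CAT case). */
+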
+static void ascertainmentBiasSequence(unsigned char tip[32], int numStates)
+{ 
+  assert(numStates <= 32 && numStates > 1);
+
+  switch(numStates)
+    {
+    case 2:     
+      tip[0] = 1;
+      tip[1] = 2;
+      break;
+    case 4:
+      tip[0] = 1;
+      tip[1] = 2;
+      tip[2] = 4;
+      tip[3] = 8;
+      break;
+    default:
+      {
+	int 
+	  i;
+	for(i = 0; i < numStates; i++)
+	  {
+	    tip[i] = i;
+	    //printf("%c ", inverseMeaningPROT[i]);
+	  }
+	//printf("\n");
+      }
+      break;
+    }
+}
+
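+/* coreCatAsc() / coreGammaAsc() below evaluate the ascertainment-bias (Lewis)
+   correction terms: lh accumulates the probability of the dummy constant-site
+   patterns, and the returned derivatives are those of log(1 - lh), i.e. (sketch)
+
+     d  log(1 - lh) / dlz  = lh' / (lh - 1)
+     d2 log(1 - lh) / dlz2 = ((lh - 1) * lh'' - lh'^2) / (lh - 1)^2
+
+   execCore() later subtracts these, weighted by the partition's site count w,
+   from the uncorrected derivatives. */
+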
+static double coreCatAsc(double *EIGN, double *sumtable, int upper,
+			 volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double lz, const int numStates,
+			 double *ascScaler)
+{
+  double  
+    diagptable[1024], 
+    lh = 0.0,
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0,
+    ki, 
+    kisqr;
+
+  int     
+    i,     
+    l;  
+
+ 
+  ki = 1.0;
+  kisqr = 1.0;
+
+  for(l = 1; l < numStates; l++)
+    {
+      diagptable[l * 4]     = exp(EIGN[l-1] * ki * lz);
+      diagptable[l * 4 + 1] = EIGN[l-1] * ki;
+      diagptable[l * 4 + 2] = EIGN[l-1] * EIGN[l-1] * kisqr;
+    }
+
+  for (i = 0; i < upper; i++)
+    {
+      double
+	*sum = &sumtable[i * numStates],
+	tmp,
+	inv_Li   = 0.0,
+	dlnLidlz = 0.0,
+	d2lnLidlz2 = 0.0;
+
+    
+      inv_Li += sum[0];
+
+      for(l = 1; l < numStates; l++)
+	{
+	  inv_Li     += (tmp = diagptable[l * 4] * sum[l]);
+	  dlnLidlz   += tmp * diagptable[l * 4 + 1];
+	  d2lnLidlz2 += tmp * diagptable[l * 4 + 2];
+	}	            
+            
+      inv_Li = fabs(inv_Li);             
+       
+      lh        += inv_Li * ascScaler[i];
+      dlnLdlz   += dlnLidlz * ascScaler[i];
+      d2lnLdlz2 += d2lnLidlz2 * ascScaler[i];
+    } 
+
+  *ext_dlnLdlz   = (dlnLdlz / (lh - 1.0));
+  *ext_d2lnLdlz2 = (((lh - 1.0) * (d2lnLdlz2) - (dlnLdlz * dlnLdlz)) / ((lh - 1.0) * (lh - 1.0)));  
+
+  return lh;
+}
+
+
+static double coreGammaAsc(double *gammaRates, double *EIGN, double *sumtable, int upper,
+			   volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double lz, const int numStates,
+			   double *ascScaler)
+{
+  double  
+    diagptable[1024], 
+    lh = 0.0,
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0,
+    ki, 
+    kisqr;
+
+  int     
+    i, 
+    j, 
+    l;  
+
+  const int 
+    gammaStates = 4 * numStates;
+
+  for(i = 0; i < 4; i++)
+    {
+      ki = gammaRates[i];
+      kisqr = ki * ki;
+
+      for(l = 1; l < numStates; l++)
+	{
+	  diagptable[i * gammaStates + l * 4]     = exp(EIGN[l-1] * ki * lz);
+	  diagptable[i * gammaStates + l * 4 + 1] = EIGN[l-1] * ki;
+	  diagptable[i * gammaStates + l * 4 + 2] = EIGN[l-1] * EIGN[l-1] * kisqr;
+	}
+    }
+
+  for (i = 0; i < upper; i++)
+    {
+      double
+	*sum = &sumtable[i * gammaStates],
+	tmp,
+	inv_Li   = 0.0,
+	dlnLidlz = 0.0,
+	d2lnLidlz2 = 0.0;
+
+      for(j = 0; j < 4; j++)
+	{
+	  inv_Li += sum[j * numStates];
+
+	  for(l = 1; l < numStates; l++)
+	    {
+	      inv_Li     += (tmp = diagptable[j * gammaStates + l * 4] * sum[j * numStates + l]);
+	      dlnLidlz   += tmp * diagptable[j * gammaStates + l * 4 + 1];
+	      d2lnLidlz2 += tmp * diagptable[j * gammaStates + l * 4 + 2];
+	    }	  
+	}    
+            
+      inv_Li = 0.25 * fabs(inv_Li);         
+      dlnLidlz *= 0.25;
+      d2lnLidlz2 *= 0.25;
+       
+      lh        += inv_Li * ascScaler[i];
+      dlnLdlz   += dlnLidlz * ascScaler[i];
+      d2lnLdlz2 += d2lnLidlz2 * ascScaler[i];
+    } 
+
+  *ext_dlnLdlz   = (dlnLdlz / (lh - 1.0));
+  *ext_d2lnLdlz2 = (((lh - 1.0) * (d2lnLdlz2) - (dlnLdlz * dlnLdlz)) / ((lh - 1.0) * (lh - 1.0)));  
+
+  return lh;
+}
+
+static void sumCatAsc(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+			int n, const int numStates)
+{
+  int i, k;
+  double *left, *right, *sum;
+
+  unsigned char 
+    tip[32];
+
+  ascertainmentBiasSequence(tip, numStates);
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      for(i = 0; i < n; i++)
+	{
+	  left  = &(tipVector[numStates * tip[i]]);
+	  right = &(tipVector[numStates * tip[i]]);
+
+	  
+	  sum = &sumtable[i * numStates];
+	  
+	  for(k = 0; k < numStates; k++)
+	    sum[k] = left[k] * right[k];	  
+	}
+      break;
+    case PLL_TIP_INNER:
+      for(i = 0; i < n; i++)
+	{
+	  left = &(tipVector[numStates * tip[i]]);
+
+	  
+	  right = &(x2[i * numStates]);
+	  sum = &sumtable[i * numStates];
+
+	  for(k = 0; k < numStates; k++)
+	    sum[k] = left[k] * right[k];	 
+	}
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+	{
+	  left  = &(x1[i * numStates]);
+	  right = &(x2[i * numStates]);
+	  sum   = &(sumtable[i * numStates]);
+
+	  for(k = 0; k < numStates; k++)
+	    sum[k] = left[k] * right[k];	 
+	}
+      break;
+    default:
+      assert(0);
+    }
+}
+
+static void sumGammaAsc(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+			int n, const int numStates)
+{
+  int i, l, k;
+  double *left, *right, *sum;
+
+  const int gammaStates = numStates * 4;
+
+  unsigned char 
+    tip[32];
+
+  ascertainmentBiasSequence(tip, numStates);
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      for(i = 0; i < n; i++)
+	{
+	  left  = &(tipVector[numStates * tip[i]]);
+	  right = &(tipVector[numStates * tip[i]]);
+
+	  for(l = 0; l < 4; l++)
+	    {
+	      sum = &sumtable[i * gammaStates + l * numStates];
+	      for(k = 0; k < numStates; k++)
+		sum[k] = left[k] * right[k];
+	    }
+	}
+      break;
+    case PLL_TIP_INNER:
+      for(i = 0; i < n; i++)
+	{
+	  left = &(tipVector[numStates * tip[i]]);
+
+	  for(l = 0; l < 4; l++)
+	    {
+	      right = &(x2[gammaStates * i + l * numStates]);
+	      sum = &sumtable[i * gammaStates + l * numStates];
+
+	      for(k = 0; k < numStates; k++)
+		sum[k] = left[k] * right[k];
+	    }
+	}
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+	{
+	  for(l = 0; l < 4; l++)
+	    {
+	      left  = &(x1[gammaStates * i + l * numStates]);
+	      right = &(x2[gammaStates * i + l * numStates]);
+	      sum   = &(sumtable[i * gammaStates + l * numStates]);
+
+	      for(k = 0; k < numStates; k++)
+		sum[k] = left[k] * right[k];
+	    }
+	}
+      break;
+    default:
+      assert(0);
+    }
+}
+
+
+
+
+#if (!defined(__AVX) && !defined(__SSE3))
+static void coreCAT_FLEX(int upper, int numberOfCategories, double *sum,
+    volatile double *d1, volatile double *d2, int *wgt,
+    double *rptr, double *EIGN, int *cptr, double lz, const int states)
+    /* rptr perSiteRates pointer, cptr rateCategory pointer */
+{
+  int 
+    i, 
+    l;
+
+  double 
+    *d, 
+
+    /* arrays to store stuff we can pre-compute */
+    *d_start = NULL,
+    *e = NULL,
+    *s = NULL,
+    *dd = NULL,
+    inv_Li, 
+    dlnLidlz, 
+    d2lnLidlz2,
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0;
+
+  rax_posix_memalign ((void **) &d_start, PLL_BYTE_ALIGNMENT, numberOfCategories * states * sizeof(double));
+  rax_posix_memalign ((void **) &e,       PLL_BYTE_ALIGNMENT, (states * sizeof(double)));
+  rax_posix_memalign ((void **) &s,       PLL_BYTE_ALIGNMENT, states * sizeof(double));
+  rax_posix_memalign ((void **) &dd,      PLL_BYTE_ALIGNMENT, states * sizeof(double)),
+  d = d_start;
+
+  e[0] = 0.0;
+  s[0] = 0.0; 
+  dd[0] = 0.0;
+
+
+  /* we are pre-computing values for computing the first and second derivatives of P(lz),
+     since the exponential is the only thing we really have to differentiate here */
+
+  for(l = 1; l < states; l++)
+  { 
+    s[l]  = EIGN[l];
+    e[l]  = EIGN[l] * EIGN[l];     
+    dd[l] = s[l] * lz;
+  }
+
+  /* compute the P matrices and their derivatives for 
+     all per-site rate categories */
+
+  for(i = 0; i < numberOfCategories; i++)
+  {      
+    d[states * i] = 1.0;
+    for(l = 1; l < states; l++)
+      d[states * i + l] = exp(dd[l] * rptr[i]);
+  }
+
+
+  /* now loop over the sites in this partition to obtain the per-site 1st and 2nd derivatives */
+
+  for (i = 0; i < upper; i++)
+  {    
+    /* get the correct p matrix for the rate at the current site i */
+
+    d = &d_start[states * cptr[i]];      
+
+    /* this is the likelihood at site i, NOT the log likelihood, we don't need the log 
+       likelihood to compute derivatives ! */
+
+    inv_Li     = sum[states * i]; 
+
+    /* those are for storing the first and second derivative of the Likelihood at site i */
+
+    dlnLidlz   = 0.0;
+    d2lnLidlz2 = 0.0;
+
+    /* now multiply the likelihood and the first and second derivative with the 
+       appropriate derivatives of P(lz) */
+
+    for(l = 1; l < states; l++)
+    {
+      double
+        tmpv = d[l] * sum[states * i + l];
+
+      inv_Li     += tmpv;                 
+      dlnLidlz   += tmpv * s[l];       
+      d2lnLidlz2 += tmpv * e[l];
+    }     
+
+    /* below we are implementing the other mathematical operations that are required 
+       to obtain the derivatives */
+
+    inv_Li = 1.0 / fabs (inv_Li);
+
+    dlnLidlz   *= inv_Li;
+    d2lnLidlz2 *= inv_Li;
+
+    /* compute the accumulated first and second derivatives of this site */
+
+    dlnLdlz  += wgt[i] * rptr[cptr[i]] * dlnLidlz;
+    d2lnLdlz2 += wgt[i] * rptr[cptr[i]] * rptr[cptr[i]] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+  }
+
+  /* 
+     set the result values, i.e., the sum of the per-site first and second derivatives of the likelihood function 
+     for this partition. 
+     */
+
+  *d1  = dlnLdlz;
+  *d2 = d2lnLdlz2;
+
+  /* free the temporary arrays */
+
+  rax_free(d_start);
+  rax_free(e);
+  rax_free(s);
+  rax_free(dd);
+}
+
+static void coreGAMMA_FLEX(int upper, double *sumtable, volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, 
+    double *EIGN, double *gammaRates, double lz, int *wrptr, const int states)
+{
+  double  
+    *sum, 
+    diagptable[1024], /* TODO make this dynamic */
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0,
+    ki, 
+    kisqr,
+    tmp,
+    inv_Li, 
+    dlnLidlz, 
+    d2lnLidlz2;
+
+  int     
+    i, 
+    j, 
+    l;  
+
+  const int 
+    gammaStates = 4 * states;
+
+  /* pre-compute the derivatives of the P matrix for all discrete GAMMA rates */
+
+  for(i = 0; i < 4; i++)
+  {
+    ki = gammaRates[i];
+    kisqr = ki * ki;
+
+    for(l = 1; l < states; l++)
+    {
+      diagptable[i * gammaStates + l * 4]     = exp(EIGN[l] * ki * lz);
+      diagptable[i * gammaStates + l * 4 + 1] = EIGN[l] * ki;
+      diagptable[i * gammaStates + l * 4 + 2] = EIGN[l] * EIGN[l] * kisqr;
+    }
+  }
+
+  /* loop over sites in this partition */
+
+  for (i = 0; i < upper; i++)
+  {
+    /* access the array with pre-computed values */
+    sum = &sumtable[i * gammaStates];
+
+    /* initial per-site likelihood and 1st and 2nd derivatives */
+
+    inv_Li   = 0.0;
+    dlnLidlz = 0.0;
+    d2lnLidlz2 = 0.0;
+
+    /* loop over discrete GAMMA rates */
+
+    for(j = 0; j < 4; j++)
+    {
+      inv_Li += sum[j * states];
+
+      for(l = 1; l < states; l++)
+      {
+        inv_Li     += (tmp = diagptable[j * gammaStates + l * 4] * sum[j * states + l]);
+        dlnLidlz   +=  tmp * diagptable[j * gammaStates + l * 4 + 1];
+        d2lnLidlz2 +=  tmp * diagptable[j * gammaStates + l * 4 + 2];
+      }
+    }
+
+    /* finalize derivative computation */
+    /* note that, unlike in the CAT case above, wrptr[] here is simply the 
+       integer weight vector of the current site.
+
+       The operations
+
+       EIGN[l] * ki;
+       EIGN[l] * EIGN[l] * kisqr;
+
+       whose * ki and * ki * ki factors are folded into the per-site rate weighting 
+       in the CAT case, are carried out explicitly here.
+
+*/
+
+    inv_Li = 1.0 / fabs (inv_Li);
+
+    dlnLidlz   *= inv_Li;
+    d2lnLidlz2 *= inv_Li;
+
+    dlnLdlz   += wrptr[i] * dlnLidlz;
+    d2lnLdlz2 += wrptr[i] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+  }
+
+  *ext_dlnLdlz   = dlnLdlz;
+  *ext_d2lnLdlz2 = d2lnLdlz2;
+
+}
+#endif
+
+//void sumGAMMA_FLEX_reorder(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+//    unsigned char *tipX1, unsigned char *tipX2, int n, const int states);
+
+/** @brief Precompute values (sumtable) from the 2 likelihood vectors of a given branch
+ *
+ * @warning These precomputations are stored in \a tr->partitionData[model].sumBuffer, which is used by function \a execCore
+ *
+ * @param tr
+ *   Library instance
+ *
+ * @warning the given branch is implicitly defined in \a tr by these nodes:
+ * pNumber = tr->td[0].ti[0].pNumber;
+ * qNumber = tr->td[0].ti[0].qNumber;
+ *
+ *
+ * @note This function should be called only once at the very beginning of each Newton-Raphson procedure for optimizing branch lengths. It initially invokes an iterative newview call to get a consistent pair of vectors at the left and the right end of the branch, and thereafter invokes the one-time-only precomputation of values (sumtable) that can be re-used in each Newton-Raphson iteration. Once this function has been called we can execute the actual NR procedure.
+ *
+ *
+ */
+void makenewzIterative(pllInstance *tr, partitionList * pr)
+{
+  int 
+    model, 
+    tipCase;
+
+  double
+    *x1_start     = NULL,
+    *x2_start     = NULL,
+    *x1_start_asc = NULL,
+    *x2_start_asc = NULL;
+
+
+  unsigned char
+    *tipX1,
+    *tipX2;
+
+  double
+    *x1_gapColumn = (double*)NULL,
+    *x2_gapColumn = (double*)NULL;
+
+  unsigned int
+    *x1_gap = (unsigned int*)NULL,
+    *x2_gap = (unsigned int*)NULL;                            
+
+  /* call pllNewviewIterative to get the likelihood arrays to the left and right of the branch */
+
+  pllNewviewIterative(tr, pr, 1);
+
+
+  /* 
+     loop over all partitions to do the precomputation of the sumTable buffer. 
+     This is analogous to the pllNewviewIterative() and pllEvaluateIterative() 
+     implementations.
+     */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+  { 
+    int 
+      width = pr->partitionData[model]->width;
+
+    if(tr->td[0].executeModel[model] && width > 0)
+    {
+      int          
+        states = pr->partitionData[model]->states;
+
+
+      getVects(tr, pr, &tipX1, &tipX2, &x1_start, &x2_start, &tipCase, model, &x1_gapColumn, &x2_gapColumn, &x1_gap, &x2_gap, &x1_start_asc, &x2_start_asc);
+
+#if (!defined(__SSE3) && !defined(__AVX) && !defined(__MIC_NATIVE))
+      assert(!tr->saveMemory);
+      if(tr->rateHetModel == PLL_CAT)
+        sumCAT_FLEX(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+            width, states);
+      else
+        //sumGAMMA_FLEX_reorder(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+          sumGAMMA_FLEX(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+            width, states);
+#else
+      switch(states)
+      {
+      case 2: /* BINARY */
+          assert(!tr->saveMemory);
+          if (tr->rateHetModel == PLL_CAT)
+            sumCAT_BINARY(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                          width);
+          else
+            sumGAMMA_BINARY(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                            width);
+          break;
+      case 4: /* DNA */
+#ifdef __MIC_NATIVE
+      assert(!tr->saveMemory);
+      assert(tr->rateHetModel == PLL_GAMMA);
+
+      sumGTRGAMMA_MIC(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+          width);
+#else
+          if(tr->rateHetModel == PLL_CAT)
+          {
+            if(tr->saveMemory)
+              sumCAT_SAVE(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                  width, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+            else
+              sumCAT(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                  width);
+          }
+          else
+          {
+            if(tr->saveMemory)
+              sumGAMMA_GAPPED_SAVE(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                  width, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+            else
+              sumGAMMA(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                  width);
+          }
+#endif
+          break;                
+        case 20: /* proteins */
+#ifdef __MIC_NATIVE
+          assert(!tr->saveMemory);
+          assert(tr->rateHetModel == PLL_GAMMA);
+
+              if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                          sumGTRGAMMAPROT_LG4_MIC(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector_LG4, tipX1, tipX2,
+                                  width);
+              else
+                          sumGTRGAMMAPROT_MIC(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                                  width);
+#else
+
+            if(tr->rateHetModel == PLL_CAT)
+          {
+            if(tr->saveMemory)
+              sumGTRCATPROT_SAVE(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector,
+                  tipX1, tipX2, width, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+            else                      
+              sumGTRCATPROT(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector,
+                  tipX1, tipX2, width);
+          }
+          else
+          {
+
+            if(tr->saveMemory)
+              sumGAMMAPROT_GAPPED_SAVE(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                  width, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+              else
+                    {
+                      if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                        sumGAMMAPROT_LG4(tipCase,  pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector_LG4,
+                                         tipX1, tipX2, width);
+            else
+              sumGAMMAPROT(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector,
+                  tipX1, tipX2, width);
+                    }
+          }
+#endif
+          break;                
+        default:
+          assert(0);
+      }
+#endif
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+      if (pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+      if (pr->partitionData[model]->ascBias)
+#endif
+       {
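+            /* restore the magnitude of the ancestral ascertainment vectors: each
+               dummy per-state pattern is rescaled by PLL_MINLIKELIHOOD raised to
+               the number of scaling events recorded in the ascExpVector(s) */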
+            int pNumber = tr->td[0].ti[0].pNumber, qNumber =
+                    tr->td[0].ti[0].qNumber, i, *ex1_asc =
+                    &pr->partitionData[model]->ascExpVector[(pNumber
+                            - tr->mxtips - 1) * states], *ex2_asc =
+                    &pr->partitionData[model]->ascExpVector[(qNumber
+                            - tr->mxtips - 1) * states];
+            switch (tipCase)
+            {
+            case PLL_TIP_TIP:
+                assert(0);
+                break;
+            case PLL_TIP_INNER:
+                if (isTip(pNumber, tr->mxtips))
+                {
+                    for (i = 0; i < states; i++)
+                        pr->partitionData[model]->ascScaler[i] = pow(
+                                PLL_MINLIKELIHOOD, (double) ex2_asc[i]);
+                }
+                else
+                {
+                    for (i = 0; i < states; i++)
+                        pr->partitionData[model]->ascScaler[i] = pow(
+                                PLL_MINLIKELIHOOD, (double) ex1_asc[i]);
+                }
+                break;
+            case PLL_INNER_INNER:
+                for (i = 0; i < states; i++)
+                    pr->partitionData[model]->ascScaler[i] = pow(
+                            PLL_MINLIKELIHOOD,
+                            (double) (ex1_asc[i] + ex2_asc[i]));
+                break;
+            default:
+                assert(0);
+            }
+         if (tr->rateHetModel == PLL_CAT)
+           sumCatAsc  (tipCase, pr->partitionData[model]->ascSumBuffer, x1_start_asc, x2_start_asc, pr->partitionData[model]->tipVector, states, states);
+         else
+           sumGammaAsc(tipCase, pr->partitionData[model]->ascSumBuffer, x1_start_asc, x2_start_asc, pr->partitionData[model]->tipVector, states, states);
+       }
+    }
+  }
+}
+
+
+/** @brief Compute first and second derivatives of the likelihood with respect to a given branch length 
+ *
+ * @param tr
+ *   library instance
+ *
+ * @param _dlnLdlz 
+ *   First derivative dl/dlz
+ *
+ * @param _d2lnLdlz2
+ *   Second derivative d(dl/dlz)/dlz
+ *
+ * @warning \a makenewzIterative should have been called to precompute \a tr->partitionData[model].sumBuffer at the given branch
+ *
+ * @note  This function actually computes the first and second derivatives of the likelihood for a given branch stored in tr->coreLZ[model]. Note that in the parallel case coreLZ must always be broadcast together with the traversal descriptor, at least for optimizing branch lengths 
+ *
+ */
+void execCore(pllInstance *tr, partitionList *pr, volatile double *_dlnLdlz, volatile double *_d2lnLdlz2)
+{
+  int model, branchIndex;
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  double lz;
+
+  _dlnLdlz[0]   = 0.0;
+  _d2lnLdlz2[0] = 0.0;
+
+  /* loop over partitions */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+  {
+    int 
+      width = pr->partitionData[model]->width;
+
+    /* check if we (the present thread, for instance) need to compute something at 
+       all for the present partition */
+
+    if(tr->td[0].executeModel[model] && width > 0)
+    {
+      int           
+        states = pr->partitionData[model]->states;
+
+      double 
+        *sumBuffer       = (double*)NULL;
+
+
+      volatile double
+        dlnLdlz   = 0.0,
+                  d2lnLdlz2 = 0.0;
+
+      /* set a pointer to the part of the pre-computed sumBuffer we are going to access */
+
+      sumBuffer = pr->partitionData[model]->sumBuffer;
+
+      /* figure out if we are optimizing branch lengths individually per partition or jointly across 
+         all partitions. If we do this on a per partition basis, we also need to compute and store 
+         the per-partition derivatives of the likelihood separately, otherwise not */
+
+      if(numBranches > 1)
+      {
+        branchIndex = model;          
+        lz = tr->td[0].parameterValues[model];
+        _dlnLdlz[model]   = 0.0;
+        _d2lnLdlz2[model] = 0.0;
+      }
+      else
+      {
+        branchIndex = 0;              
+        lz = tr->td[0].parameterValues[0];
+      }
+
+#if (!defined(__SSE3) && !defined(__AVX) && !defined(__MIC_NATIVE))
+      /* compute first and second derivatives with the slow generic functions */
+
+      if(tr->rateHetModel == PLL_CAT)
+        coreCAT_FLEX(width, pr->partitionData[model]->numberOfCategories, sumBuffer,
+            &dlnLdlz, &d2lnLdlz2, pr->partitionData[model]->wgt,
+            pr->partitionData[model]->perSiteRates, pr->partitionData[model]->EIGN,  pr->partitionData[model]->rateCategory, lz, states);
+      else
+        coreGAMMA_FLEX(width, sumBuffer,
+            &dlnLdlz, &d2lnLdlz2, pr->partitionData[model]->EIGN, pr->partitionData[model]->gammaRates, lz,
+            pr->partitionData[model]->wgt, states);
+#else
+      switch(states)
+       {    
+         case 2: /* BINARY */
+           if (tr->rateHetModel == PLL_CAT)
+              coreGTRCAT_BINARY(width, 
+                                pr->partitionData[model]->numberOfCategories, 
+                                sumBuffer,
+                                &dlnLdlz, 
+                                &d2lnLdlz2, 
+                                pr->partitionData[model]->perSiteRates, 
+                                pr->partitionData[model]->EIGN,  
+                                pr->partitionData[model]->rateCategory, 
+                                lz, 
+                                pr->partitionData[model]->wgt);
+           else
+              coreGTRGAMMA_BINARY(width, 
+                                   sumBuffer,
+                                   &dlnLdlz, 
+                                   &d2lnLdlz2, 
+                                   pr->partitionData[model]->EIGN,
+                                   pr->partitionData[model]->gammaRates, 
+                                   lz,
+                                   pr->partitionData[model]->wgt);
+           break;
+         case 4: /* DNA */
+#ifdef __MIC_NATIVE
+           assert(tr->rateHetModel == PLL_GAMMA);
+
+           coreGTRGAMMA_MIC(width, 
+                            sumBuffer,
+                            &dlnLdlz, 
+                            &d2lnLdlz2, 
+                            pr->partitionData[model]->EIGN, 
+                            pr->partitionData[model]->gammaRates, 
+                            lz,
+                            pr->partitionData[model]->wgt);
+#else
+          if(tr->rateHetModel == PLL_CAT)
+            coreGTRCAT(width, pr->partitionData[model]->numberOfCategories, sumBuffer,
+                &dlnLdlz, &d2lnLdlz2, pr->partitionData[model]->wgt,
+                pr->partitionData[model]->perSiteRates, pr->partitionData[model]->EIGN,  pr->partitionData[model]->rateCategory, lz);
+          else 
+            coreGTRGAMMA(width, sumBuffer,
+                &dlnLdlz, &d2lnLdlz2, pr->partitionData[model]->EIGN, pr->partitionData[model]->gammaRates, lz,
+                pr->partitionData[model]->wgt);
+
+#endif
+          break;                    
+        case 20: /* proteins */
+
+#ifdef __MIC_NATIVE
+      assert(tr->rateHetModel == PLL_GAMMA);
+
+          if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                  coreGTRGAMMAPROT_LG4_MIC(width, sumBuffer,
+                          &dlnLdlz, &d2lnLdlz2, pr->partitionData[model]->EIGN_LG4, pr->partitionData[model]->gammaRates, lz,
+                          pr->partitionData[model]->wgt, pr->partitionData[model]->lg4x_weights);
+          else
+                  coreGTRGAMMAPROT_MIC(width, sumBuffer,
+                          &dlnLdlz, &d2lnLdlz2, pr->partitionData[model]->EIGN, pr->partitionData[model]->gammaRates, lz,
+                          pr->partitionData[model]->wgt);
+#else
+
+          if(tr->rateHetModel == PLL_CAT)
+            coreGTRCATPROT(pr->partitionData[model]->EIGN, lz, pr->partitionData[model]->numberOfCategories,  pr->partitionData[model]->perSiteRates,
+                pr->partitionData[model]->rateCategory, width,
+                pr->partitionData[model]->wgt,
+                &dlnLdlz, &d2lnLdlz2,
+                sumBuffer);
+            else
+                { 
+                  if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                    coreGTRGAMMAPROT_LG4(pr->partitionData[model]->gammaRates, pr->partitionData[model]->EIGN_LG4,
+                                         sumBuffer, width, pr->partitionData[model]->wgt,
+                                         &dlnLdlz, &d2lnLdlz2, lz, pr->partitionData[model]->lg4x_weights);
+          else
+
+            coreGTRGAMMAPROT(pr->partitionData[model]->gammaRates, pr->partitionData[model]->EIGN,
+                sumBuffer, width, pr->partitionData[model]->wgt,
+                &dlnLdlz, &d2lnLdlz2, lz);
+            
+                }
+#endif
+          break;                   
+        default:
+          assert(0);
+      }
+#endif
+
+      /* store first and second derivative */
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+     if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+     if(pr->partitionData[model]->ascBias)
+#endif  
+       {
+         size_t
+           i;
+
+         double 
+           correction;
+
+         int            
+           w = 0;
+         
+         volatile double 
+           d1 = 0.0,
+           d2 = 0.0;                   
+         
+         for(i = (size_t)pr->partitionData[model]->lower; i < (size_t)pr->partitionData[model]->upper; i++)
+           w += tr->aliaswgt[i];     
+         
+          switch(tr->rateHetModel)
+            {
+            case PLL_CAT:
+              correction = coreCatAsc(pr->partitionData[model]->EIGN, pr->partitionData[model]->ascSumBuffer, states,
+                                        &d1,  &d2, lz, states, pr->partitionData[model]->ascScaler);
+              break;
+            case PLL_GAMMA:
+              correction = coreGammaAsc(pr->partitionData[model]->gammaRates, pr->partitionData[model]->EIGN, pr->partitionData[model]->ascSumBuffer, states,
+                                        &d1,  &d2, lz, states, pr->partitionData[model]->ascScaler);
+              break;
+            default:
+              assert(0);
+            }
+        
+         correction = 1.0 - correction;
+     
+         /* Lewis correction */
+         _dlnLdlz[branchIndex]   =  _dlnLdlz[branchIndex] + dlnLdlz - (double)w * d1;
+         _d2lnLdlz2[branchIndex] =  _d2lnLdlz2[branchIndex] + d2lnLdlz2-  (double)w * d2;
+           
+       }  
+      else
+       {
+         _dlnLdlz[branchIndex]   = _dlnLdlz[branchIndex]   + dlnLdlz;
+         _d2lnLdlz2[branchIndex] = _d2lnLdlz2[branchIndex] + d2lnLdlz2;
+       }
+    }
+    else
+    {
+      /* set to 0 to make the reduction operation consistent */
+
+      if(width == 0 && (numBranches > 1))
+      {
+        _dlnLdlz[model]   = 0.0;
+        _d2lnLdlz2[model] = 0.0;
+      }                                    
+    }
+  }
+
+}
+
+
+/* the function below actually implements the iterative Newton-Raphson procedure.
+   It is particularly messy and hard to read because for the case of per-partition branch length 
+   estimates it needs to keep track of whether the Newton-Raphson procedure has 
+   converged for each partition individually. 
+
+   The rationale for doing it like this is also provided in:
+
+
+   A. Stamatakis, M. Ott: "Load Balance in the Phylogenetic Likelihood Kernel". Proceedings of ICPP 2009,
+
+*/
+
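+/* One Newton-Raphson step in log-branch-length space (sketch): with lz = log(z)
+   and f(lz) the log likelihood, the update is lz_new = lz - f'(lz) / f''(lz),
+   i.e. z_new = z * exp(-dlnLdlz / d2lnLdlz2).  The loop below applies this,
+   clamps z to [PLL_ZMIN, PLL_ZMAX], damps steps towards 0.25 * zprev + 0.75
+   when they would overshoot, and shortens the branch when the curvature has
+   the wrong sign (d2lnLdlz2 >= 0). */
+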
+static void topLevelMakenewz(pllInstance *tr, partitionList * pr, double *z0, int _maxiter, double *result)
+{
+  double   z[PLL_NUM_BRANCHES], zprev[PLL_NUM_BRANCHES], zstep[PLL_NUM_BRANCHES];
+  volatile double  dlnLdlz[PLL_NUM_BRANCHES], d2lnLdlz2[PLL_NUM_BRANCHES];
+  int i, maxiter[PLL_NUM_BRANCHES], model;
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+  pllBoolean firstIteration = PLL_TRUE;
+  pllBoolean outerConverged[PLL_NUM_BRANCHES];
+  pllBoolean loopConverged;
+
+
+  /* figure out if this is on a per partition basis or jointly across all partitions */
+
+
+
+  /* initialize loop convergence variables etc. 
+     maxiter is the maximum number of NR iterations we are going to do before giving up */
+
+  for(i = 0; i < numBranches; i++)
+  {
+    z[i] = z0[i];
+    maxiter[i] = _maxiter;
+    outerConverged[i] = PLL_FALSE;
+    tr->curvatOK[i]       = PLL_TRUE;
+  }
+
+
+  /* nested do while loops of Newton-Raphson */
+
+  do
+  {
+
+    /* check if we are done for partition i or if we need to adapt the branch length again */
+
+    for(i = 0; i < numBranches; i++)
+    {
+      if(outerConverged[i] == PLL_FALSE && tr->curvatOK[i] == PLL_TRUE)
+      {
+        tr->curvatOK[i] = PLL_FALSE;
+
+        zprev[i] = z[i];
+
+        zstep[i] = (1.0 - PLL_ZMAX) * z[i] + PLL_ZMIN;
+      }
+    }
+
+    for(i = 0; i < numBranches; i++)
+    {
+      /* other case, the outer loop hasn't converged but we are trying to approach 
+         the maximum from the wrong side */
+
+      if(outerConverged[i] == PLL_FALSE && tr->curvatOK[i] == PLL_FALSE)
+      {
+        double lz;
+
+        if (z[i] < PLL_ZMIN) z[i] = PLL_ZMIN;
+        else if (z[i] > PLL_ZMAX) z[i] = PLL_ZMAX;
+        lz    = log(z[i]);
+
+        tr->coreLZ[i] = lz;
+      }
+    }
+
+
+    /* set the execution mask */
+
+    if(numBranches > 1)
+    {
+      for(model = 0; model < pr->numberOfPartitions; model++)
+      {
+        if(pr->partitionData[model]->executeModel)
+          pr->partitionData[model]->executeModel = !tr->curvatOK[model];
+
+      }
+    }
+    else
+    {
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        pr->partitionData[model]->executeModel = !tr->curvatOK[0];
+    }
+
+
+    /* store it in traversal descriptor */
+
+    storeExecuteMaskInTraversalDescriptor(tr, pr);
+
+    /* store the new branch length values to be tested in traversal descriptor */
+
+    storeValuesInTraversalDescriptor(tr, pr, &(tr->coreLZ[0]));
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+
+    /* if this is the first iteration of NR we first need to do this one-time call 
+       of makenewzIterative(). Note that only this call requires broadcasting the traversal descriptor;
+       subsequent calls to pllMasterBarrier(PLL_THREAD_MAKENEWZ, tr); will not require this
+       */
+
+    if(firstIteration)
+      {
+        tr->td[0].traversalHasChanged = PLL_TRUE; 
+        pllMasterBarrier (tr, pr, PLL_THREAD_MAKENEWZ_FIRST);
+        firstIteration = PLL_FALSE; 
+        tr->td[0].traversalHasChanged = PLL_FALSE; 
+      }
+    else 
+      pllMasterBarrier(tr, pr, PLL_THREAD_MAKENEWZ);
+    branchLength_parallelReduce(tr, (double*)dlnLdlz, (double*)d2lnLdlz2, numBranches);
+#else 
+    /* sequential part: if this is the first Newton-Raphson iteration,
+       do the precomputations as well, otherwise just execute the computation
+       of the derivatives */
+    if(firstIteration)
+      {
+        makenewzIterative(tr, pr);
+        firstIteration = PLL_FALSE;
+      }
+    execCore(tr, pr, dlnLdlz, d2lnLdlz2);
+#endif
+
+    /* do an NR step; if we are on the correct side of the maximum that's okay, otherwise 
+       shorten the branch */
+
+    for(i = 0; i < numBranches; i++)
+    {
+      if(outerConverged[i] == PLL_FALSE && tr->curvatOK[i] == PLL_FALSE)
+      {
+        if ((d2lnLdlz2[i] >= 0.0) && (z[i] < PLL_ZMAX))
+          zprev[i] = z[i] = 0.37 * z[i] + 0.63;  /*  Bad curvature, shorten branch */
+        else
+          tr->curvatOK[i] = PLL_TRUE;
+      }
+    }
+
+    /* do the standard NR step to obtain the next value, depending on the state of each partition */
+
+    for(i = 0; i < numBranches; i++)
+    {
+      if(tr->curvatOK[i] == PLL_TRUE && outerConverged[i] == PLL_FALSE)
+      {
+        if (d2lnLdlz2[i] < 0.0)
+        {
+          double tantmp = -dlnLdlz[i] / d2lnLdlz2[i];
+          if (tantmp < 100)
+          {
+            z[i] *= exp(tantmp);
+            if (z[i] < PLL_ZMIN)
+              z[i] = PLL_ZMIN;
+
+            if (z[i] > 0.25 * zprev[i] + 0.75)
+              z[i] = 0.25 * zprev[i] + 0.75;
+          }
+          else
+            z[i] = 0.25 * zprev[i] + 0.75;
+        }
+        if (z[i] > PLL_ZMAX) z[i] = PLL_ZMAX;
+
+        /* decrement the maximum number of iterations */
+
+        maxiter[i] = maxiter[i] - 1;
+
+        /* check if the outer loop has converged */
+
+        //old code below commented out, integrated new PRELIMINARY BUG FIX !
+        //this needs further work at some point!
+
+        /*
+        if(maxiter[i] > 0 && (PLL_ABS(z[i] - zprev[i]) > zstep[i]))
+          outerConverged[i] = PLL_FALSE;
+        else
+          outerConverged[i] = PLL_TRUE;
+        */
+
+        if((PLL_ABS(z[i] - zprev[i]) > zstep[i]))
+         {
+           /* We should make a more informed decision here,
+              based on the log likelihood improvement */
+
+           if(maxiter[i] < -20)
+            {
+              z[i] = z0[i];
+              outerConverged[i] = PLL_TRUE;
+            }
+           else
+             outerConverged[i] = PLL_FALSE;
+         }
+        else
+          outerConverged[i] = PLL_TRUE;
+      }
+    }
+
+    /* check if the loop has converged for all partitions */
+
+    loopConverged = PLL_TRUE;
+    for(i = 0; i < numBranches; i++)
+      loopConverged = loopConverged && outerConverged[i];
+  }
+  while (!loopConverged);
+
+
+  /* reset  partition execution mask */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    pr->partitionData[model]->executeModel = PLL_TRUE;
+
+  /* copy the new branches in the result array of branches.
+     if we don't do a per partition estimate of 
+     branches this will only set result[0]
+     */
+
+  for(i = 0; i < numBranches; i++)
+    result[i] = z[i];
+}
+
+
+/** @brief Optimize branch length value(s) of a given branch with the Newton-Raphson procedure 
+ *
+ * @warning A given branch may have one or several branch length values (up to PLL_NUM_BRANCHES); usually the latter refers to partition-specific branch length values. Thus z0 and result represent collections rather than single double values. The number of branch length values is given by \a tr->numBranches 
+ *
+ * @param tr
+ *   Library instance
+ *
+ * @param p
+ *   One node that defines the branch (p->z)
+ *
+ * @param q
+ *   The node at the other end of the branch (usually p->back), but the branch length can be estimated even if p and q are
+ *   not connected, e.g. before the insertion of a subtree.
+ *
+ * @param z0 
+ *   Initial branch length value(s) for the given branch \a p->z 
+ *
+ * @param maxiter 
+ *   Maximum number of iterations in the Newton-Raphson procedure 
+ *
+ * @param result 
+ *   Resulting branch length value(s) for the given branch \a p->z 
+ *
+ * @param mask 
+ *   Specifies if a mask to track partition convergence (\a tr->partitionConverged) is being used.
+ *
+ * @sa typical values for \a maxiter are constants \a iterations and \a PLL_NEWZPERCYCLE
+ * @note Requirement: q->z == p->z
+ */
+void makenewzGeneric(pllInstance *tr, partitionList * pr, nodeptr p, nodeptr q, double *z0, int maxiter, double *result, pllBoolean mask)
+{
+  int i;
+  //boolean originalExecute[PLL_NUM_BRANCHES];
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  pllBoolean 
+    p_recom = PLL_FALSE, /* if one of them was missing, we will need to force recomputation */
+    q_recom = PLL_FALSE;
+
+  /* the first entry of the traversal descriptor stores the node pair that defines 
+     the branch */
+
+  tr->td[0].ti[0].pNumber = p->number;
+  tr->td[0].ti[0].qNumber = q->number;
+
+  for(i = 0; i < numBranches; i++)
+  {
+    //originalExecute[i] =  pr->partitionData[i]->executeModel;
+    tr->td[0].ti[0].qz[i] =  z0[i];
+    if(mask)
+    {
+      if (tr->partitionConverged[i])
+        pr->partitionData[i]->executeModel = PLL_FALSE;
+      else
+        pr->partitionData[i]->executeModel = PLL_TRUE;
+    }
+  }
+  if (tr->useRecom)
+  {
+    int
+      slot = -1;
+      //count = 0;
+
+    /* Ensure p and q get an unpinnable slot in physical memory */
+    if(!isTip(q->number, tr->mxtips))
+    {
+      q_recom = getxVector(tr->rvec, q->number, &slot, tr->mxtips);
+      tr->td[0].ti[0].slot_q = slot;
+    }
+    if(!isTip(p->number, tr->mxtips))
+    {
+      p_recom = getxVector(tr->rvec, p->number, &slot, tr->mxtips);
+      tr->td[0].ti[0].slot_p = slot;
+    }
+  }
+
+
+  /* compute the traversal descriptor of the likelihood vectors  that need to be re-computed 
+     first in makenewzIterative */
+
+  tr->td[0].count = 1;
+
+  if(p_recom || needsRecomp(tr->useRecom, tr->rvec, p, tr->mxtips))
+    computeTraversal(tr, p, PLL_TRUE, numBranches);
+
+  if(q_recom || needsRecomp(tr->useRecom, tr->rvec, q, tr->mxtips))
+    computeTraversal(tr, q, PLL_TRUE, numBranches);
+
+  /* call the Newton-Raphson procedure */
+
+  topLevelMakenewz(tr, pr, z0, maxiter, result);
+
+  /* Mark node as unpinnable */
+  if(tr->useRecom)
+  {
+    unpinNode(tr->rvec, p->number, tr->mxtips);
+    unpinNode(tr->rvec, q->number, tr->mxtips);
+  }
+
+  /* fix executeModel; this seems to be a bit redundant with topLevelMakenewz */
+
+  for(i = 0; i < numBranches; i++)
+    pr->partitionData[i]->executeModel = PLL_TRUE;
+}
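+
+/* Illustrative call sequence (a sketch, not code taken from the library):
+   callers typically optimize the branch between p and its back pointer
+   starting from the current value, roughly along these lines:
+
+     double z0[PLL_NUM_BRANCHES], result[PLL_NUM_BRANCHES];
+     int i, numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+     for(i = 0; i < numBranches; i++)
+       z0[i] = p->z[i];
+     makenewzGeneric(tr, pr, p, p->back, z0, PLL_NEWZPERCYCLE, result, PLL_FALSE);
+
+   and then copy result[] back into p->z[] / p->back->z[] (remember the
+   requirement q->z == p->z noted above). */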
+
+
+/* below are, once again the optimized functions */
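+
+/* The sum*() functions below precompute, for every site, the element-wise
+   product of the two likelihood vectors that define the branch (the
+   "sumtable").  The coreGTR*() functions further down then only need to
+   combine this table with the precomputed exponential/derivative tables to
+   accumulate the first and second derivatives of the log likelihood. */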
+
+#if (defined(__SSE3) || defined(__AVX))
+
+
+static void sumCAT_BINARY(int tipCase, double *sum, double *x1_start, double *x2_start, double *tipVector,
+                          unsigned char *tipX1, unsigned char *tipX2, int n)
+
+{
+  int i;
+  
+#if (!defined(__SSE3) && !defined(__AVX))
+  int j;
+#endif
+  double *x1, *x2;
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      for (i = 0; i < n; i++)
+        {
+          x1 = &(tipVector[2 * tipX1[i]]);
+          x2 = &(tipVector[2 * tipX2[i]]);
+
+#if (!defined(__SSE3) && !defined(__AVX))
+          for(j = 0; j < 2; j++)
+            sum[i * 2 + j]     = x1[j] * x2[j];
+#else
+          _mm_store_pd(&sum[i * 2], _mm_mul_pd( _mm_load_pd(x1), _mm_load_pd(x2)));
+#endif
+        }
+      break;
+    case PLL_TIP_INNER:
+      for (i = 0; i < n; i++)
+        {
+          x1 = &(tipVector[2 * tipX1[i]]);
+          x2 = &x2_start[2 * i];
+
+#if (!defined(__SSE3) && !defined(__AVX))
+          for(j = 0; j < 2; j++)
+            sum[i * 2 + j]     = x1[j] * x2[j];
+#else
+          _mm_store_pd(&sum[i * 2], _mm_mul_pd( _mm_load_pd(x1), _mm_load_pd(x2)));  
+#endif
+        }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+        {
+          x1 = &x1_start[2 * i];
+          x2 = &x2_start[2 * i];
+#if (!defined(__SSE3) && !defined(__AVX))
+          for(j = 0; j < 2; j++)
+            sum[i * 2 + j]     = x1[j] * x2[j];
+#else
+          _mm_store_pd(&sum[i * 2], _mm_mul_pd( _mm_load_pd(x1), _mm_load_pd(x2)));   
+#endif
+        }
+      break;
+    default:
+      assert(0);
+    }
+}
+
+
+static void sumCAT_SAVE(int tipCase, double *sum, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  int i;
+  double 
+    *x1, 
+    *x2,    
+    *x1_ptr = x1_start,
+    *x2_ptr = x2_start;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for (i = 0; i < n; i++)
+      {
+        x1 = &(tipVector[4 * tipX1[i]]);
+        x2 = &(tipVector[4 * tipX2[i]]);
+
+        _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
+        _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (i = 0; i < n; i++)
+      {
+        x1 = &(tipVector[4 * tipX1[i]]);
+        if(isGap(x2_gap, i))
+          x2 = x2_gapColumn;
+        else
+        {
+          x2 = x2_ptr;
+          x2_ptr += 4;
+        }
+
+        _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
+        _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+      {
+        if(isGap(x1_gap, i))
+          x1 = x1_gapColumn;
+        else
+        {
+          x1 = x1_ptr;
+          x1_ptr += 4;
+        }
+
+        if(isGap(x2_gap, i))
+          x2 = x2_gapColumn;
+        else
+        {
+          x2 = x2_ptr;
+          x2_ptr += 4;
+        }
+
+        _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
+        _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+
+      }    
+      break;
+    default:
+      assert(0);
+  }
+}
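+
+/* Note on the *_SAVE flavours: isGap(x_gap, i) tests bit i of the gap
+   bit-vector; for an all-gap site the shared gapColumn vector is used and the
+   per-site pointer is not advanced, which is what saves the memory.  The
+   explicit form x_gap[i / 32] & mask32[i % 32] used in the GAMMA variants
+   below appears to be the same test. */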
+
+static void sumGAMMA_BINARY(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+                            unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  double *x1, *x2, *sum;
+  int i, j;
+#if (!defined(_USE_PTHREADS) && !defined(_FINE_GRAIN_MPI))
+  int k;
+#endif
+
+  /* C-OPT once again switch over possible configurations at inner node */
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      /* C-OPT main for loop over alignment length */
+      for (i = 0; i < n; i++)
+        {
+          x1 = &(tipVector[2 * tipX1[i]]);
+          x2 = &(tipVector[2 * tipX2[i]]);
+          sum = &sumtable[i * 8];
+#if (!defined(_USE_PTHREADS) && !defined(_FINE_GRAIN_MPI))
+          for(j = 0; j < 4; j++)
+            for(k = 0; k < 2; k++)
+              sum[j * 2 + k] = x1[k] * x2[k];
+#else
+          for(j = 0; j < 4; j++)
+            _mm_store_pd( &sum[j*2], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));         
+#endif
+        }
+      break;
+    case PLL_TIP_INNER:
+      for (i = 0; i < n; i++)
+        {
+          x1  = &(tipVector[2 * tipX1[i]]);
+          x2  = &x2_start[8 * i];
+          sum = &sumtable[8 * i];
+
+#if (!defined(_USE_PTHREADS) && !defined(_FINE_GRAIN_MPI))
+          for(j = 0; j < 4; j++)
+            for(k = 0; k < 2; k++)
+              sum[j * 2 + k] = x1[k] * x2[j * 2 + k];
+#else
+          for(j = 0; j < 4; j++)
+            _mm_store_pd( &sum[j*2], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[j * 2] )));
+#endif
+        }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+        {
+          x1  = &x1_start[8 * i];
+          x2  = &x2_start[8 * i];
+          sum = &sumtable[8 * i];
+#if (!defined(_USE_PTHREADS) && !defined(_FINE_GRAIN_MPI))
+          for(j = 0; j < 4; j++)
+            for(k = 0; k < 2; k++)
+              sum[j * 2 + k] = x1[j * 2 + k] * x2[j * 2 + k];
+#else
+          for(j = 0; j < 4; j++)
+            _mm_store_pd( &sum[j*2], _mm_mul_pd( _mm_load_pd( &x1[j * 2] ), _mm_load_pd( &x2[j * 2] )));
+#endif
+        }
+      break;
+    default:
+      assert(0);
+    }
+}
+
+
+static void sumGAMMA_GAPPED_SAVE(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, 
+    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  double 
+    *x1, 
+    *x2, 
+    *sum,
+    *x1_ptr = x1_start,
+    *x2_ptr = x2_start;
+
+  int i, j, k; 
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:     
+      for (i = 0; i < n; i++)
+      {
+        x1 = &(tipVector[4 * tipX1[i]]);
+        x2 = &(tipVector[4 * tipX2[i]]);
+        sum = &sumtable[i * 16];
+
+        for(j = 0; j < 4; j++)      
+          for(k = 0; k < 4; k+=2)
+            _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[k] ), _mm_load_pd( &x2[k] )));
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (i = 0; i < n; i++)
+      {
+        x1  = &(tipVector[4 * tipX1[i]]);
+
+        if(x2_gap[i / 32] & mask32[i % 32])
+          x2 = x2_gapColumn;
+        else
+        {
+          x2  = x2_ptr;
+          x2_ptr += 16;
+        }
+
+        sum = &sumtable[16 * i];
+
+        for(j = 0; j < 4; j++)      
+          for(k = 0; k < 4; k+=2)
+            _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[k] ), _mm_load_pd( &x2[j * 4 + k] )));
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+      {
+        if(x1_gap[i / 32] & mask32[i % 32])
+          x1 = x1_gapColumn;
+        else
+        {
+          x1  = x1_ptr;
+          x1_ptr += 16;
+        }
+
+        if(x2_gap[i / 32] & mask32[i % 32])
+          x2 = x2_gapColumn;
+        else
+        {
+          x2  = x2_ptr;
+          x2_ptr += 16;
+        }
+
+        sum = &sumtable[16 * i];
+
+
+        for(j = 0; j < 4; j++)      
+          for(k = 0; k < 4; k+=2)
+            _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[j * 4 + k] ), _mm_load_pd( &x2[j * 4 + k] )));
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
+
+
+static void sumGAMMA(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  double *x1, *x2, *sum;
+  int i, j, k;
+
+  /* C-OPT once again switch over possible configurations at inner node */
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      /* C-OPT main for loop over alignment length */
+      for (i = 0; i < n; i++)
+      {
+        x1 = &(tipVector[4 * tipX1[i]]);
+        x2 = &(tipVector[4 * tipX2[i]]);
+        sum = &sumtable[i * 16];
+
+        for(j = 0; j < 4; j++)      
+          for(k = 0; k < 4; k+=2)
+            _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[k] ), _mm_load_pd( &x2[k] )));
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (i = 0; i < n; i++)
+      {
+        x1  = &(tipVector[4 * tipX1[i]]);
+        x2  = &x2_start[16 * i];
+        sum = &sumtable[16 * i];
+
+        for(j = 0; j < 4; j++)      
+          for(k = 0; k < 4; k+=2)
+            _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[k] ), _mm_load_pd( &x2[j * 4 + k] )));
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+      {
+        x1  = &x1_start[16 * i];
+        x2  = &x2_start[16 * i];
+        sum = &sumtable[16 * i];
+
+        for(j = 0; j < 4; j++)      
+          for(k = 0; k < 4; k+=2)
+            _mm_store_pd( &sum[j*4 + k], _mm_mul_pd( _mm_load_pd( &x1[j * 4 + k] ), _mm_load_pd( &x2[j * 4 + k] )));
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
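+
+/* Layout note: under GAMMA each site occupies 16 doubles in the sumtable
+   (4 rate categories x 4 nucleotide states), whereas under CAT it is only 4
+   doubles per site, because a single per-site rate category is used. */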
+
+
+static void sumCAT(int tipCase, double *sum, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  int i;
+  double 
+    *x1, 
+    *x2;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for (i = 0; i < n; i++)
+      {
+        x1 = &(tipVector[4 * tipX1[i]]);
+        x2 = &(tipVector[4 * tipX2[i]]);
+
+        _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
+        _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (i = 0; i < n; i++)
+      {
+        x1 = &(tipVector[4 * tipX1[i]]);
+        x2 = &x2_start[4 * i];
+
+        _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
+        _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+      {
+        x1 = &x1_start[4 * i];
+        x2 = &x2_start[4 * i];
+
+        _mm_store_pd( &sum[i*4 + 0], _mm_mul_pd( _mm_load_pd( &x1[0] ), _mm_load_pd( &x2[0] )));
+        _mm_store_pd( &sum[i*4 + 2], _mm_mul_pd( _mm_load_pd( &x1[2] ), _mm_load_pd( &x2[2] )));
+
+      }    
+      break;
+    default:
+      assert(0);
+  }
+}
+static void sumGAMMAPROT_GAPPED_SAVE(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, 
+    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  int i, l, k;
+  double 
+    *left, 
+    *right, 
+    *sum,
+    *x1_ptr = x1,
+    *x2_ptr = x2,
+    *x1v,
+    *x2v;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for(i = 0; i < n; i++)
+      {
+        left  = &(tipVector[20 * tipX1[i]]);
+        right = &(tipVector[20 * tipX2[i]]);
+
+        for(l = 0; l < 4; l++)
+        {
+          sum = &sumtable[i * 80 + l * 20];
+
+          for(k = 0; k < 20; k+=2)
+          {
+            __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+
+            _mm_store_pd(&sum[k], sumv);                 
+          }
+
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      for(i = 0; i < n; i++)
+      {
+        left = &(tipVector[20 * tipX1[i]]);
+
+        if(x2_gap[i / 32] & mask32[i % 32])
+          x2v = x2_gapColumn;
+        else
+        {
+          x2v = x2_ptr;
+          x2_ptr += 80;
+        }
+
+        for(l = 0; l < 4; l++)
+        {
+          right = &(x2v[l * 20]);
+          sum = &sumtable[i * 80 + l * 20];
+
+          for(k = 0; k < 20; k+=2)
+          {
+            __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+
+            _mm_store_pd(&sum[k], sumv);                 
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+      {
+        if(x1_gap[i / 32] & mask32[i % 32])
+          x1v = x1_gapColumn;
+        else
+        {
+          x1v  = x1_ptr;
+          x1_ptr += 80;
+        }
+
+        if(x2_gap[i / 32] & mask32[i % 32])
+          x2v = x2_gapColumn;
+        else
+        {
+          x2v  = x2_ptr;
+          x2_ptr += 80;
+        }
+
+        for(l = 0; l < 4; l++)
+        {
+          left  = &(x1v[l * 20]);
+          right = &(x2v[l * 20]);
+          sum   = &(sumtable[i * 80 + l * 20]);
+
+          for(k = 0; k < 20; k+=2)
+          {
+            __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+
+            _mm_store_pd(&sum[k], sumv);                 
+          }
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
+static void sumGAMMAPROT_LG4(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector[4],
+                             unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  int i, l, k;
+  double *left, *right, *sum;
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      for(i = 0; i < n; i++)
+        {         
+          for(l = 0; l < 4; l++)
+            {
+              left  = &(tipVector[l][20 * tipX1[i]]);
+              right = &(tipVector[l][20 * tipX2[i]]);
+
+              sum = &sumtable[i * 80 + l * 20];
+#ifdef __SSE3
+              for(k = 0; k < 20; k+=2)
+                {
+                  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+                  
+                  _mm_store_pd(&sum[k], sumv);           
+                }
+#else
+              for(k = 0; k < 20; k++)
+                sum[k] = left[k] * right[k];
+#endif
+            }
+        }
+      break;
+    case PLL_TIP_INNER:
+      for(i = 0; i < n; i++)
+        {
+         
+
+          for(l = 0; l < 4; l++)
+            { 
+              left = &(tipVector[l][20 * tipX1[i]]);
+              right = &(x2[80 * i + l * 20]);
+              sum = &sumtable[i * 80 + l * 20];
+#ifdef __SSE3
+              for(k = 0; k < 20; k+=2)
+                {
+                  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+                  
+                  _mm_store_pd(&sum[k], sumv);           
+                }
+#else
+              for(k = 0; k < 20; k++)
+                sum[k] = left[k] * right[k];
+#endif
+            }
+        }
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+        {
+          for(l = 0; l < 4; l++)
+            {
+              left  = &(x1[80 * i + l * 20]);
+              right = &(x2[80 * i + l * 20]);
+              sum   = &(sumtable[i * 80 + l * 20]);
+
+#ifdef __SSE3
+              for(k = 0; k < 20; k+=2)
+                {
+                  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+                  
+                  _mm_store_pd(&sum[k], sumv);           
+                }
+#else
+              for(k = 0; k < 20; k++)
+                sum[k] = left[k] * right[k];
+#endif
+            }
+        }
+      break;
+    default:
+      assert(0);
+    }
+}
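+
+/* In the LG4 variant each of the four rate categories has its own tip vector
+   (and its own eigenvalue set in the corresponding core function), hence
+   tipVector is passed as an array of four pointers indexed by the category l
+   above. */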
+
+
+static void sumGAMMAPROT(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  int i, l, k;
+  double *left, *right, *sum;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for(i = 0; i < n; i++)
+      {
+        left  = &(tipVector[20 * tipX1[i]]);
+        right = &(tipVector[20 * tipX2[i]]);
+
+        for(l = 0; l < 4; l++)
+        {
+          sum = &sumtable[i * 80 + l * 20];
+
+          for(k = 0; k < 20; k+=2)
+          {
+            __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+
+            _mm_store_pd(&sum[k], sumv);                 
+          }
+
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      for(i = 0; i < n; i++)
+      {
+        left = &(tipVector[20 * tipX1[i]]);
+
+        for(l = 0; l < 4; l++)
+        {
+          right = &(x2[80 * i + l * 20]);
+          sum = &sumtable[i * 80 + l * 20];
+
+          for(k = 0; k < 20; k+=2)
+          {
+            __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+
+            _mm_store_pd(&sum[k], sumv);                 
+          }
+
+        }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+      {
+        for(l = 0; l < 4; l++)
+        {
+          left  = &(x1[80 * i + l * 20]);
+          right = &(x2[80 * i + l * 20]);
+          sum   = &(sumtable[i * 80 + l * 20]);
+
+
+          for(k = 0; k < 20; k+=2)
+          {
+            __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
+
+            _mm_store_pd(&sum[k], sumv);                 
+          }
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
+static void sumGTRCATPROT(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  int i, l;
+  double *sum, *left, *right;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for (i = 0; i < n; i++)
+      {
+        left  = &(tipVector[20 * tipX1[i]]);
+        right = &(tipVector[20 * tipX2[i]]);
+        sum = &sumtable[20 * i];
+
+        for(l = 0; l < 20; l+=2)
+        {
+          __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+
+          _mm_store_pd(&sum[l], sumv);           
+        }
+
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (i = 0; i < n; i++)
+      {
+        left = &(tipVector[20 * tipX1[i]]);
+        right = &x2[20 * i];
+        sum = &sumtable[20 * i];
+
+        for(l = 0; l < 20; l+=2)
+        {
+          __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+
+          _mm_store_pd(&sum[l], sumv);           
+        }
+
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+      {
+        left  = &x1[20 * i];
+        right = &x2[20 * i];
+        sum = &sumtable[20 * i];
+
+        for(l = 0; l < 20; l+=2)
+        {
+          __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+
+          _mm_store_pd(&sum[l], sumv);           
+        }
+
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
+static void sumGTRCATPROT_SAVE(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, 
+    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  int 
+    i, 
+    l;
+
+  double 
+    *sum, 
+    *left, 
+    *right,
+    *left_ptr = x1,
+    *right_ptr = x2;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for (i = 0; i < n; i++)
+      {
+        left  = &(tipVector[20 * tipX1[i]]);
+        right = &(tipVector[20 * tipX2[i]]);
+        sum = &sumtable[20 * i];
+
+        for(l = 0; l < 20; l+=2)
+        {
+          __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+
+          _mm_store_pd(&sum[l], sumv);           
+        }
+
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (i = 0; i < n; i++)
+      {
+        left = &(tipVector[20 * tipX1[i]]);       
+
+        if(isGap(x2_gap, i))
+          right = x2_gapColumn;
+        else
+        {
+          right = right_ptr;
+          right_ptr += 20;
+        }
+
+        sum = &sumtable[20 * i];
+
+        for(l = 0; l < 20; l+=2)
+        {
+          __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+
+          _mm_store_pd(&sum[l], sumv);           
+        }
+
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+      {  
+        if(isGap(x1_gap, i))
+          left = x1_gapColumn;
+        else
+        {
+          left = left_ptr;
+          left_ptr += 20;
+        }
+
+        if(isGap(x2_gap, i))
+          right = x2_gapColumn;
+        else
+        {
+          right = right_ptr;
+          right_ptr += 20;
+        }
+
+        sum = &sumtable[20 * i];
+
+        for(l = 0; l < 20; l+=2)
+        {
+          __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+
+          _mm_store_pd(&sum[l], sumv);           
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+
+static void coreGTRGAMMA(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN, double *gammaRates, double lz, int *wrptr)
+{
+  double 
+    dlnLdlz = 0.0,
+            d2lnLdlz2 = 0.0,
+            ki, 
+            kisqr,  
+            inv_Li, 
+            dlnLidlz, 
+            d2lnLidlz2,  
+		*sum;
+	PLL_ALIGN_BEGIN double
+            diagptable0[16] PLL_ALIGN_END,
+            diagptable1[16] PLL_ALIGN_END,
+            diagptable2[16] PLL_ALIGN_END;
+
+  int     
+    i, 
+    j, 
+    l;
+
+  for(i = 0; i < 4; i++)
+  {
+    ki = gammaRates[i];
+    kisqr = ki * ki;
+
+    diagptable0[i * 4] = 1.0;
+    diagptable1[i * 4] = 0.0;
+    diagptable2[i * 4] = 0.0;
+
+    for(l = 1; l < 4; l++)
+    {
+      diagptable0[i * 4 + l] = exp(EIGN[l] * ki * lz);
+      diagptable1[i * 4 + l] = EIGN[l] * ki;
+      diagptable2[i * 4 + l] = EIGN[l] * EIGN[l] * kisqr;
+    }
+  }
+
+  for (i = 0; i < upper; i++)
+  { 
+    __m128d a0 = _mm_setzero_pd();
+    __m128d a1 = _mm_setzero_pd();
+    __m128d a2 = _mm_setzero_pd();
+
+    sum = &sumtable[i * 16];         
+
+    for(j = 0; j < 4; j++)
+    {                   
+      double       
+        *d0 = &diagptable0[j * 4],
+        *d1 = &diagptable1[j * 4],
+        *d2 = &diagptable2[j * 4];
+
+      for(l = 0; l < 4; l+=2)
+      {
+        __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d0[l]), _mm_load_pd(&sum[j * 4 + l]));
+        a0 = _mm_add_pd(a0, tmpv);
+        a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(&d1[l])));
+        a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(&d2[l])));
+      }           
+    }
+
+    a0 = _mm_hadd_pd(a0, a0);
+    a1 = _mm_hadd_pd(a1, a1);
+    a2 = _mm_hadd_pd(a2, a2);
+
+    _mm_storel_pd(&inv_Li, a0);     
+    _mm_storel_pd(&dlnLidlz, a1);
+    _mm_storel_pd(&d2lnLidlz2, a2); 
+
+    inv_Li = 1.0 / fabs (inv_Li);
+
+    dlnLidlz   *= inv_Li;
+    d2lnLidlz2 *= inv_Li;     
+
+    dlnLdlz   += wrptr[i] * dlnLidlz;
+    d2lnLdlz2 += wrptr[i] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+  }
+
+
+  *ext_dlnLdlz   = dlnLdlz;
+  *ext_d2lnLdlz2 = d2lnLdlz2; 
+}
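+
+/* Reading aid for the three diagptable arrays in coreGTRGAMMA above (per
+   category i and eigenvalue index l, with the l = 0 entries fixed to 1/0/0):
+   diagptable0 holds exp(EIGN[l] * ki * lz) and enters the per-site likelihood
+   itself, while diagptable1 = EIGN[l] * ki and diagptable2 = (EIGN[l] * ki)^2
+   are the factors picked up by the first and second derivatives with respect
+   to lz.  Per site the code then forms dlnLi/dlz = (dLi/dlz) / Li and
+   d2lnLi/dlz2 = (d2Li/dlz2) / Li - (dlnLi/dlz)^2 and accumulates both,
+   weighted by the site weight wrptr[i]. */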
+
+static void coreGTRCAT_BINARY(int upper, int numberOfCategories, double *sum,
+                              volatile double *d1, volatile double *d2, 
+                              double *rptr, double *EIGN, int *cptr, double lz, int *wgt)
+{
+  int i;
+  double
+    *d, *d_start = NULL,
+    tmp_0, inv_Li, dlnLidlz, d2lnLidlz2,
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0;
+  double e[2];
+  double dd1;
+
+  e[0] = EIGN[0];
+  e[1] = EIGN[0] * EIGN[0];
+
+
+  d = d_start = (double *)rax_malloc(numberOfCategories * sizeof(double));
+
+  dd1 = e[0] * lz;
+
+  for(i = 0; i < numberOfCategories; i++)
+    d[i] = exp(dd1 * rptr[i]);
+
+  for (i = 0; i < upper; i++)
+    {
+      double
+        r = rptr[cptr[i]],
+        wr1 = r * wgt[i],
+        wr2 = r * r * wgt[i];
+      
+      d = &d_start[cptr[i]];
+
+      inv_Li = sum[2 * i];
+      inv_Li += (tmp_0 = d[0] * sum[2 * i + 1]);
+
+      inv_Li = 1.0/fabs(inv_Li);
+
+      dlnLidlz   = tmp_0 * e[0];
+      d2lnLidlz2 = tmp_0 * e[1];
+
+      dlnLidlz   *= inv_Li;
+      d2lnLidlz2 *= inv_Li;
+
+      dlnLdlz   += wr1 * dlnLidlz;
+      d2lnLdlz2 += wr2 * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+    }
+
+  *d1 = dlnLdlz;
+  *d2 = d2lnLdlz2;
+
+  rax_free(d_start);
+}
+
+
+static void coreGTRCAT(int upper, int numberOfCategories, double *sum,
+    volatile double *d1, volatile double *d2, int *wgt,
+    double *rptr, double *EIGN, int *cptr, double lz)
+{
+  int i;
+  double
+    *d, *d_start = NULL,
+    inv_Li, dlnLidlz, d2lnLidlz2,
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0;
+
+  PLL_ALIGN_BEGIN double e1[4] PLL_ALIGN_END;
+  PLL_ALIGN_BEGIN double e2[4] PLL_ALIGN_END;
+  double dd1, dd2, dd3;
+
+  __m128d
+    e1v[2],
+    e2v[2];
+
+  e1[0] = 0.0;
+  e2[0] = 0.0;
+  e1[1] = EIGN[1];
+  e2[1] = EIGN[1] * EIGN[1];
+  e1[2] = EIGN[2];
+  e2[2] = EIGN[2] * EIGN[2];
+  e1[3] = EIGN[3];
+  e2[3] = EIGN[3] * EIGN[3];
+
+  e1v[0]= _mm_load_pd(&e1[0]);
+  e1v[1]= _mm_load_pd(&e1[2]);
+
+  e2v[0]= _mm_load_pd(&e2[0]);
+  e2v[1]= _mm_load_pd(&e2[2]);
+
+  rax_posix_memalign ((void **) &d_start, PLL_BYTE_ALIGNMENT, numberOfCategories * 4 * sizeof(double));
+  d = d_start;
+
+  dd1 = EIGN[1] * lz;
+  dd2 = EIGN[2] * lz;
+  dd3 = EIGN[3] * lz;
+
+  for(i = 0; i < numberOfCategories; i++)
+  {
+    d[i * 4 + 0] = 1.0;
+    d[i * 4 + 1] = exp(dd1 * rptr[i]);
+    d[i * 4 + 2] = exp(dd2 * rptr[i]);
+    d[i * 4 + 3] = exp(dd3 * rptr[i]);
+  }
+
+  for (i = 0; i < upper; i++)
+  {
+    double *s = &sum[4 * i];
+    d = &d_start[4 * cptr[i]];  
+
+    __m128d tmp_0v =_mm_mul_pd(_mm_load_pd(&d[0]),_mm_load_pd(&s[0]));
+    __m128d tmp_1v =_mm_mul_pd(_mm_load_pd(&d[2]),_mm_load_pd(&s[2]));
+
+    __m128d inv_Liv    = _mm_add_pd(tmp_0v, tmp_1v);      
+
+    __m128d dlnLidlzv   = _mm_add_pd(_mm_mul_pd(tmp_0v, e1v[0]), _mm_mul_pd(tmp_1v, e1v[1]));     
+    __m128d d2lnLidlz2v = _mm_add_pd(_mm_mul_pd(tmp_0v, e2v[0]), _mm_mul_pd(tmp_1v, e2v[1]));
+
+
+    inv_Liv   = _mm_hadd_pd(inv_Liv, inv_Liv);
+    dlnLidlzv = _mm_hadd_pd(dlnLidlzv, dlnLidlzv);
+    d2lnLidlz2v = _mm_hadd_pd(d2lnLidlz2v, d2lnLidlz2v);                 
+
+    _mm_storel_pd(&inv_Li, inv_Liv);     
+    _mm_storel_pd(&dlnLidlz, dlnLidlzv);                 
+    _mm_storel_pd(&d2lnLidlz2, d2lnLidlz2v);      
+
+    inv_Li = 1.0 / fabs (inv_Li);
+
+    dlnLidlz   *= inv_Li;
+    d2lnLidlz2 *= inv_Li;
+
+    dlnLdlz  += wgt[i] * rptr[cptr[i]] * dlnLidlz;
+    d2lnLdlz2 += wgt[i] * rptr[cptr[i]] * rptr[cptr[i]] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+  }
+
+  *d1 = dlnLdlz;
+  *d2 = d2lnLdlz2;
+
+  rax_free(d_start);
+}
+
+#if (!defined(__SSE3) && !defined(__AVX))
+static void coreGTRGAMMA_BINARY(const int upper, double *sumtable,
+                                volatile double *d1,   volatile double *d2, double *EIGN, double *gammaRates, double lz, int *wrptr)
+{
+  int i, j;
+  double
+    *diagptable, *diagptable_start, *sum,
+    tmp_1, inv_Li, dlnLidlz, d2lnLidlz2, ki, kisqr,
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0;
+
+  diagptable = diagptable_start = (double *)rax_malloc(sizeof(double) * 12);
+
+  for(i = 0; i < 4; i++)
+    {
+      ki = gammaRates[i];
+      kisqr = ki * ki;
+
+      diagptable[i * 3]     = exp (EIGN[1] * ki * lz);
+      diagptable[i * 3 + 1] = EIGN[1] * ki;
+      diagptable[i * 3 + 2] = EIGN[1] * EIGN[1] * kisqr;
+    }
+
+  for (i = 0; i < upper; i++)
+    {
+      diagptable = diagptable_start;
+      sum = &(sumtable[i * 8]);
+
+      inv_Li      = 0.0;
+      dlnLidlz    = 0.0;
+      d2lnLidlz2  = 0.0;
+
+      for(j = 0; j < 4; j++)
+        {
+          inv_Li += sum[2 * j];
+
+          tmp_1      =  diagptable[3 * j] * sum[2 * j + 1];
+          inv_Li     += tmp_1;
+          dlnLidlz   += tmp_1 * diagptable[3 * j + 1];
+          d2lnLidlz2 += tmp_1 * diagptable[3 * j + 2];
+        }
+
+      inv_Li = 1.0 / fabs(inv_Li);
+
+      dlnLidlz   *= inv_Li;
+      d2lnLidlz2 *= inv_Li;
+
+
+      dlnLdlz  += wrptr[i] * dlnLidlz;
+      d2lnLdlz2 += wrptr[i] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+    }
+
+  *d1 = dlnLdlz;
+  *d2 = d2lnLdlz2;
+
+  rax_free(diagptable_start);
+}
+#else
+static void coreGTRGAMMA_BINARY(const int upper, double *sumtable,
+                                volatile double *d1,   volatile double *d2, double *EIGN, double *gammaRates, double lz, int *wrptr)
+{
+	double
+		dlnLdlz = 0.0,
+		d2lnLdlz2 = 0.0,
+		ki,
+		kisqr,
+		inv_Li,
+		dlnLidlz,
+		d2lnLidlz2,
+		*sum;
+	PLL_ALIGN_BEGIN double
+		diagptable0[8] PLL_ALIGN_END,
+		diagptable1[8] PLL_ALIGN_END,
+		diagptable2[8] PLL_ALIGN_END;
+    
+  int     
+    i, 
+    j;
+  
+  for(i = 0; i < 4; i++)
+    {
+      ki = gammaRates[i];
+      kisqr = ki * ki;
+      
+      diagptable0[i * 2] = 1.0;
+      diagptable1[i * 2] = 0.0;
+      diagptable2[i * 2] = 0.0;
+     
+      diagptable0[i * 2 + 1] = exp(EIGN[0] * ki * lz);
+      diagptable1[i * 2 + 1] = EIGN[0] * ki;
+      diagptable2[i * 2 + 1] = EIGN[0] * EIGN[0] * kisqr;    
+    }
+
+  for (i = 0; i < upper; i++)
+    { 
+      __m128d a0 = _mm_setzero_pd();
+      __m128d a1 = _mm_setzero_pd();
+      __m128d a2 = _mm_setzero_pd();
+
+      sum = &sumtable[i * 8];         
+
+      for(j = 0; j < 4; j++)
+        {                       
+          double           
+            *d0 = &diagptable0[j * 2],
+            *d1 = &diagptable1[j * 2],
+            *d2 = &diagptable2[j * 2];
+                         
+          __m128d tmpv = _mm_mul_pd(_mm_load_pd(d0), _mm_load_pd(&sum[j * 2]));
+          a0 = _mm_add_pd(a0, tmpv);
+          a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(d1)));
+          a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(d2)));
+                          
+        }
+
+      a0 = _mm_hadd_pd(a0, a0);
+      a1 = _mm_hadd_pd(a1, a1);
+      a2 = _mm_hadd_pd(a2, a2);
+
+      _mm_storel_pd(&inv_Li, a0);     
+      _mm_storel_pd(&dlnLidlz, a1);
+      _mm_storel_pd(&d2lnLidlz2, a2); 
+
+      inv_Li = 1.0 / fabs(inv_Li);
+     
+      dlnLidlz   *= inv_Li;
+      d2lnLidlz2 *= inv_Li;     
+
+      dlnLdlz   += wrptr[i] * dlnLidlz;
+      d2lnLdlz2 += wrptr[i] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+    }
+
+ 
+  *d1   = dlnLdlz;
+  *d2 = d2lnLdlz2; 
+}
+
+
+#endif
+
+static void coreGTRGAMMAPROT_LG4(double *gammaRates, double *EIGN[4], double *sumtable, int upper, int *wrptr,
+                                 volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double lz,
+                                 double * lg4_weights)
+{
+	double  *sum;
+	PLL_ALIGN_BEGIN double
+    diagptable0[80] PLL_ALIGN_END,
+    diagptable1[80] PLL_ALIGN_END,
+    diagptable2[80] PLL_ALIGN_END;    
+  int     i, j, l;
+  double  dlnLdlz = 0;
+  double d2lnLdlz2 = 0;
+  double ki, kisqr; 
+
+  for(i = 0; i < 4; i++)
+    {
+      ki = gammaRates[i];
+      kisqr = ki * ki;
+      
+      diagptable0[i * 20] = 1.0;
+      diagptable1[i * 20] = 0.0;
+      diagptable2[i * 20] = 0.0;
+
+      for(l = 1; l < 20; l++)
+        {
+          diagptable0[i * 20 + l] = exp(EIGN[i][l] * ki * lz);
+          diagptable1[i * 20 + l] = EIGN[i][l] * ki;
+          diagptable2[i * 20 + l] = EIGN[i][l] * EIGN[i][l] * kisqr;
+        }
+    }
+
+  for (i = 0; i < upper; i++)
+    { 
+
+      double
+      	  inv_Li = 0.0,
+      	  dlnLidlz = 0.0,
+      	  d2lnLidlz2 = 0.0;
+
+      sum = &sumtable[i * 80];         
+
+      for(j = 0; j < 4; j++)
+        {                       
+          double
+          	l0,
+          	l1,
+          	l2,
+            *d0 = &diagptable0[j * 20],
+            *d1 = &diagptable1[j * 20],
+            *d2 = &diagptable2[j * 20];
+                 
+          __m128d a0 = _mm_setzero_pd();
+          __m128d a1 = _mm_setzero_pd();
+          __m128d a2 = _mm_setzero_pd();
+
+          for(l = 0; l < 20; l+=2)
+            {
+              __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d0[l]), _mm_load_pd(&sum[j * 20 +l]));
+              a0 = _mm_add_pd(a0, tmpv);
+              a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(&d1[l])));
+              a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(&d2[l])));
+            }             
+
+          a0 = _mm_hadd_pd(a0, a0);
+      	  a1 = _mm_hadd_pd(a1, a1);
+      	  a2 = _mm_hadd_pd(a2, a2);
+
+      	 _mm_storel_pd(&l0, a0);
+      	 _mm_storel_pd(&l1, a1);
+      	 _mm_storel_pd(&l2, a2);
+
+      	 inv_Li     += lg4_weights[j] * l0;
+      	 dlnLidlz   += lg4_weights[j] * l1;
+     	 d2lnLidlz2 += lg4_weights[j] * l2;
+      }
+
+      inv_Li = 1.0 / fabs (inv_Li);
+
+      dlnLidlz   *= inv_Li;
+      d2lnLidlz2 *= inv_Li;
+
+      dlnLdlz   += wrptr[i] * dlnLidlz;
+      d2lnLdlz2 += wrptr[i] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+    }
+
+  *ext_dlnLdlz   = dlnLdlz;
+  *ext_d2lnLdlz2 = d2lnLdlz2;
+}
+
+
+
+static void coreGTRGAMMAPROT(double *gammaRates, double *EIGN, double *sumtable, int upper, int *wrptr,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double lz)
+{
+	double  *sum;
+	PLL_ALIGN_BEGIN double
+		diagptable0[80] PLL_ALIGN_END,
+		diagptable1[80] PLL_ALIGN_END,
+		diagptable2[80] PLL_ALIGN_END;
+
+  int     i, j, l;
+  double  dlnLdlz = 0;
+  double d2lnLdlz2 = 0;
+  double ki, kisqr; 
+  double inv_Li, dlnLidlz, d2lnLidlz2;
+
+  for(i = 0; i < 4; i++)
+  {
+    ki = gammaRates[i];
+    kisqr = ki * ki;
+
+    diagptable0[i * 20] = 1.0;
+    diagptable1[i * 20] = 0.0;
+    diagptable2[i * 20] = 0.0;
+
+    for(l = 1; l < 20; l++)
+    {
+      diagptable0[i * 20 + l] = exp(EIGN[l] * ki * lz);
+      diagptable1[i * 20 + l] = EIGN[l] * ki;
+      diagptable2[i * 20 + l] = EIGN[l] * EIGN[l] * kisqr;
+    }
+  }
+
+  for (i = 0; i < upper; i++)
+  { 
+    __m128d a0 = _mm_setzero_pd();
+    __m128d a1 = _mm_setzero_pd();
+    __m128d a2 = _mm_setzero_pd();
+
+    sum = &sumtable[i * 80];         
+
+    for(j = 0; j < 4; j++)
+    {                   
+      double       
+        *d0 = &diagptable0[j * 20],
+        *d1 = &diagptable1[j * 20],
+        *d2 = &diagptable2[j * 20];
+
+      for(l = 0; l < 20; l+=2)
+      {
+        __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d0[l]), _mm_load_pd(&sum[j * 20 +l]));
+        a0 = _mm_add_pd(a0, tmpv);
+        a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(&d1[l])));
+        a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(&d2[l])));
+      }           
+    }
+
+    a0 = _mm_hadd_pd(a0, a0);
+    a1 = _mm_hadd_pd(a1, a1);
+    a2 = _mm_hadd_pd(a2, a2);
+
+    _mm_storel_pd(&inv_Li, a0);
+    _mm_storel_pd(&dlnLidlz, a1);
+    _mm_storel_pd(&d2lnLidlz2, a2);
+
+    inv_Li = 1.0 / fabs (inv_Li);
+
+    dlnLidlz   *= inv_Li;
+    d2lnLidlz2 *= inv_Li;
+
+    dlnLdlz   += wrptr[i] * dlnLidlz;
+    d2lnLdlz2 += wrptr[i] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+  }
+
+  *ext_dlnLdlz   = dlnLdlz;
+  *ext_d2lnLdlz2 = d2lnLdlz2;
+}
+
+
+
+static void coreGTRCATPROT(double *EIGN, double lz, int numberOfCategories, double *rptr, int *cptr, int upper,
+    int *wgt, volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *sumtable)
+{
+  int i, l;
+  double *d1, *d_start = NULL, *sum;
+  PLL_ALIGN_BEGIN double 
+    e[20] PLL_ALIGN_END, 
+    s[20] PLL_ALIGN_END, 
+    dd[20] PLL_ALIGN_END;
+  double inv_Li, dlnLidlz, d2lnLidlz2;
+  double  dlnLdlz = 0.0;
+  double  d2lnLdlz2 = 0.0;
+
+  rax_posix_memalign ((void **)&d_start, PLL_BYTE_ALIGNMENT, numberOfCategories * 20 * sizeof(double));
+  d1 = d_start; 
+
+  e[0] = 0.0;
+  s[0] = 0.0; 
+
+  for(l = 1; l < 20; l++)
+  {
+    e[l]  = EIGN[l] * EIGN[l];
+    s[l]  = EIGN[l];
+    dd[l] = s[l] * lz;
+  }
+
+  for(i = 0; i < numberOfCategories; i++)
+  {      
+    d1[20 * i] = 1.0;
+    for(l = 1; l < 20; l++)
+      d1[20 * i + l] = exp(dd[l] * rptr[i]);
+  }
+
+  for (i = 0; i < upper; i++)
+  {
+    __m128d a0 = _mm_setzero_pd();
+    __m128d a1 = _mm_setzero_pd();
+    __m128d a2 = _mm_setzero_pd();
+
+    d1 = &d_start[20 * cptr[i]];
+    sum = &sumtable[20 * i];
+
+    for(l = 0; l < 20; l+=2)
+    {     
+      __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d1[l]), _mm_load_pd(&sum[l]));
+
+      a0 = _mm_add_pd(a0, tmpv);
+      __m128d sv = _mm_load_pd(&s[l]);    
+
+      a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, sv));
+      __m128d ev = _mm_load_pd(&e[l]);    
+
+      a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, ev));
+    }
+
+    a0 = _mm_hadd_pd(a0, a0);
+    a1 = _mm_hadd_pd(a1, a1);
+    a2 = _mm_hadd_pd(a2, a2);
+
+    _mm_storel_pd(&inv_Li, a0);     
+    _mm_storel_pd(&dlnLidlz, a1);                 
+    _mm_storel_pd(&d2lnLidlz2, a2);
+
+    inv_Li = 1.0 / fabs (inv_Li);
+
+    dlnLidlz   *= inv_Li;
+    d2lnLidlz2 *= inv_Li;
+
+    dlnLdlz  += wgt[i] * rptr[cptr[i]] * dlnLidlz;
+    d2lnLdlz2 += wgt[i] * rptr[cptr[i]] * rptr[cptr[i]] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+  }
+
+  *ext_dlnLdlz   = dlnLdlz;
+  *ext_d2lnLdlz2 = d2lnLdlz2;
+
+  rax_free(d_start);
+}
+
+
+
+
+#endif
+
+
+
diff --git a/pll/mem_alloc.c b/pll/mem_alloc.c
new file mode 100644
index 0000000..68e928d
--- /dev/null
+++ b/pll/mem_alloc.c
@@ -0,0 +1,228 @@
+
+#define MEM_ALLOC_NO_GUARDS 1
+
+#include "mem_alloc.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifndef __APPLE__
+#include <malloc.h>             // this is probably not necessary
+#endif
+
+#ifdef RAXML_USE_LLALLOC
+
+// the llalloc library implementation in lockless_alloc/ll_alloc.c exports the allocation functions prefixed
+// with 'llalloc'. The following are the forward declarations of the llalloc* functions 
+
+#define PREFIX(X)   llalloc##X
+
+void *PREFIX(memalign)(size_t align, size_t size);
+void *PREFIX(malloc)(size_t size);
+void *PREFIX(realloc)(void *p, size_t size);
+int PREFIX(posix_memalign)(void **p, size_t align, size_t size);
+void *PREFIX(calloc)(size_t n, size_t size);
+void PREFIX(free)(void *p);
+
+
+// wrappers that forward the rax_* functions to the corresponding llalloc* functions
+
+
+void *rax_memalign(size_t align, size_t size) {
+  return PREFIX(memalign)(align, size);
+}
+
+void *rax_malloc( size_t size ) {
+  return PREFIX(malloc)(size);
+}
+void *rax_realloc( void *p, size_t size ) {
+  return PREFIX(realloc)(p, size);
+}
+
+
+void rax_free(void *p) {
+  PREFIX(free)(p);
+}
+
+int rax_posix_memalign(void **p, size_t align, size_t size) {
+  return PREFIX(posix_memalign)(p, align, size);
+}
+void *rax_calloc(size_t n, size_t size) {
+  return PREFIX(calloc)(n,size);
+}
+
+void *rax_malloc_aligned(size_t size) 
+{
+  const size_t PLL_BYTE_ALIGNMENT = 32;
+  return rax_memalign(PLL_BYTE_ALIGNMENT, size);
+  
+}
+
+#else // RAXML_USE_LLALLOC
+// if llalloc should not be used, forward the rax_* functions to the corresponding standard functions
+
+void *rax_memalign(size_t align, size_t size) {
+#if defined (__APPLE__)
+    void * mem;
+    if (posix_memalign (&mem, align, size))
+      return (NULL);
+    else
+      return (mem);
+#else
+    return memalign(align, size);
+#endif
+    
+}
+
+void *rax_malloc( size_t size ) {
+  return malloc(size);
+}
+void *rax_realloc( void *p, size_t size ) {
+  return realloc(p, size);
+}
+
+
+void rax_free(void *p) {
+  free(p);
+}
+
+int rax_posix_memalign(void **p, size_t align, size_t size) {
+  return posix_memalign(p, align, size);
+}
+void *rax_calloc(size_t n, size_t size) {
+  return calloc(n,size);
+}
+
+void *rax_malloc_aligned(size_t size) 
+{
+  const size_t PLL_BYTE_ALIGNMENT = 32;
+  return rax_memalign(PLL_BYTE_ALIGNMENT, size);
+  
+}
+
+#endif
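+
+/* Illustrative usage of the wrappers above (a sketch, not library code):
+   aligned buffers for the vectorized kernels are obtained and released via
+
+     double *d = NULL;
+     rax_posix_memalign((void **)&d, PLL_BYTE_ALIGNMENT, n * sizeof(double));
+     ...
+     rax_free(d);
+
+   so that the same call sites work whether or not the lockless allocator is
+   compiled in. */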
+
+
+
+#if 0
+//
+// two test cases to check if the default malloc plays along well with lockless malloc. Normally there should not be a
+// problem as long as everyone handles 'foreign' sbrk calls gracefully (as lockless and glibc seem to do). 
+// WARNING: there is a slightly worrying comment in glibc malloc, which seems to assume that magically no foreign sbrks
+// happen between two consecutive sbrk calls while re-establishing page alignment in some obscure special case. IMHO, this
+// is clearly an error (race) in multithreaded programs, as there is no way a foreign sbrk user can properly lock anything.
+// see: http://sourceware.org/git/?p=glibc.git;a=blob;f=malloc/malloc.c;h=0f1796c9134ffef289ec31fb1cd538f3a9490ae1;hb=HEAD#l2581
+//
+// If all threads consistently only use the rax_* wrappers this is not a problem, but as this is a library, we can not be sure 
+// that no other thread uses default malloc... note that lockless malloc only uses sbrk for the slab (=small block) area, while 
+// raxml heavily uses malloc/free only on much larger blocks...
+// If anything ever goes wrong while using mixed glibc/lockless malloc, this should be investigated.
+//
+// TODO: the potential race seems to be related to handling the case where a 'foreign sbrk' adjusted the break to a non page-boundary.
+// check if lockless malloc actually ever adjusts to non page-boundaries.
+
+
+void check_block( void *p, size_t size ) {
+    size_t i;
+    char *cp = (char*)p;
+    
+    for( i = 0; i < size; ++i ) {
+        
+        if( cp[i] != (char)i ) {
+            printf( "MEEEEEEEEEEEEEEEEEEEEP\n" );
+            abort();
+        }
+    }
+    
+}
+
+
+void fill_block( void *p, size_t size ) {
+    size_t i;
+    char *cp = (char*)p;
+    
+    for( i = 0; i < size; ++i ) {
+        cp[i] = (char)i;
+    }
+}
+
+
+void malloc_stress() {
+    const int n_slots = 100000;
+    
+    void *blocks1[n_slots];
+    size_t sizes1[n_slots];
+    void *blocks2[n_slots];
+    size_t sizes2[n_slots];
+    
+    memset( blocks1, 0, sizeof( void * ) * n_slots ); 
+    memset( blocks2, 0, sizeof( void * ) * n_slots ); 
+    
+    memset( sizes1, 0, sizeof( size_t ) * n_slots );
+    memset( sizes2, 0, sizeof( size_t ) * n_slots );
+    
+    
+    
+    while( 1 ) {
+        int r = rand() % n_slots;
+        
+        void *bs;
+        
+        
+        int size;
+        if( rand() % 2 == 0 ) {
+            size = rand() % (32 * 16); // hit slab
+        } else {
+            size = (rand() % 128) * 128; // not slab
+        }
+            
+            
+        if( 1 || rand() % 2 == 0 ) {
+            if( blocks1[r] == 0 ) {
+                blocks1[r] = malloc( size );
+                sizes1[r] = size;
+                fill_block( blocks1[r], sizes1[r] );
+            } else {
+                check_block( blocks1[r], sizes1[r] );
+                free( blocks1[r] );
+                blocks1[r] = 0;
+            }
+        } else {
+            if( blocks2[r] == 0 ) {
+                blocks2[r] = rax_malloc( size );
+                sizes2[r] = size;
+                fill_block( blocks2[r], sizes2[r] );
+            } else {
+                check_block( blocks2[r], sizes2[r] );
+                
+                rax_free( blocks2[r] );
+                blocks2[r] = 0;
+            }
+        }
+            
+       
+        
+    }
+    
+}
+
+
+void malloc_stress2() {
+    const size_t n_slots = 1000;
+    
+    void *blocks[n_slots];
+    size_t i;
+    for( i = 0; i < n_slots; ++i ) {
+        blocks[i] = malloc( (rand() % 32) * 1024 ); 
+        
+    }
+    sbrk( 10 );
+    for( i = 0; i < n_slots; ++i ) {
+        free(blocks[i]);
+        
+    }
+    
+    
+    
+}
+#endif
+
diff --git a/pll/mem_alloc.h b/pll/mem_alloc.h
new file mode 100644
index 0000000..0bfa08a
--- /dev/null
+++ b/pll/mem_alloc.h
@@ -0,0 +1,77 @@
+#ifndef __mem_alloc_h
+#define __mem_alloc_h
+
+#if defined WIN32 || defined _WIN32 || defined __WIN32__
+#include <stdlib.h>
+//#include <intrin.h>
+#include <malloc.h>
+//#include <windows.h>
+#endif
+
+#include <stddef.h>
+#include <stdlib.h>
+#ifdef __linux__
+#include <malloc.h>
+#endif
+#include "pll.h"
+#include <string.h>
+
+//#define rax_memalign memalign
+//#define rax_malloc malloc
+//#define rax_calloc calloc
+//#define rax_realloc realloc
+
+
+#if defined WIN32 || defined _WIN32 || defined __WIN32__
+    #if (defined(__MINGW32__) || defined(__clang__)) && defined(BINARY32)
+        #define rax_posix_memalign(ptr,alignment,size) *(ptr) = __mingw_aligned_malloc((size),(alignment))
+        #define rax_malloc(size) __mingw_aligned_malloc((size), PLL_BYTE_ALIGNMENT)
+        void *rax_calloc(size_t count, size_t size);
+        #define rax_free __mingw_aligned_free
+    #else
+        #define rax_posix_memalign(ptr,alignment,size) *(ptr) = _aligned_malloc((size),(alignment))
+        #define rax_malloc(size) _aligned_malloc((size), PLL_BYTE_ALIGNMENT)
+        void *rax_calloc(size_t count, size_t size);
+        #define rax_free _aligned_free
+    #endif
+#else
+    #define rax_posix_memalign posix_memalign
+    #define rax_malloc malloc
+    #define rax_calloc calloc
+    #define rax_free free
+#endif
+
+//#define rax_malloc_aligned(x) memalign(PLL_BYTE_ALIGNMENT,x)
+
+//void *rax_memalign(size_t align, size_t size);
+//void *rax_malloc(size_t size);
+//void *rax_realloc(void *p, size_t size);
+//void rax_free(void *p);
+//int rax_posix_memalign(void **p, size_t align, size_t size);
+//void *rax_calloc(size_t n, size_t size);
+//
+//void *rax_malloc_aligned(size_t size);
+
+
+/* local replacement for strndup() */
+static __inline char *my_strndup(const char *s, size_t n) {
+	char *ret = (char *) rax_malloc(n+1);
+	strncpy(ret, s, n);
+	ret[n] = 0;
+	return ret;
+}
+
+#if 0
+// using the following contraption to trigger a compile-time error does not work on some gcc versions. It will trigger a confusing linker error in the best case, so it is deactivated.
+
+#if defined(RAXML_USE_LLALLOC) && !defined(MEM_ALLOC_NO_GUARDS)
+#define malloc(x) XXX_DONT_USE_MALLOC_WITHOUT_RAX_PREFIX_XXX
+#define free(x) XXX_DONT_USE_FREE_WITHOUT_RAX_PREFIX_XXX
+#define calloc(x,y) XXX_DONT_USE_CALLOC_WITHOUT_RAX_PREFIX_XXX
+#define realloc(x,y) XXX_DONT_USE_REALLOC_WITHOUT_RAX_PREFIX_XXX
+#define malloc_aligned(x) XXX_DONT_USE_MALLOC_ALIGNED_WITHOUT_RAX_PREFIX_XXX
+#define posix_memalign(x,y,z) XXX_DONT_USE_POSIX_MEMALIGN_ALIGNED_WITHOUT_RAX_PREFIX_XXX
+#endif
+#endif
+
+#endif
diff --git a/pll/mic_native.h b/pll/mic_native.h
new file mode 100644
index 0000000..38b24a3
--- /dev/null
+++ b/pll/mic_native.h
@@ -0,0 +1,56 @@
+#ifndef MIC_NATIVE_H_
+#define MIC_NATIVE_H_
+
+void newviewGTRGAMMA_MIC(int tipCase,
+                  double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                  int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling);
+
+double evaluateGTRGAMMA_MIC(int *ex1, int *ex2, int *wptr,
+                 double *x1_start, double *x2_start,
+                 double *tipVector,
+                 unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling);
+
+void sumGTRGAMMA_MIC(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+void coreGTRGAMMA_MIC(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN, double *gammaRates, double lz, int *wrptr);
+
+// protein data
+void newviewGTRGAMMAPROT_MIC(int tipCase,
+                  double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                  int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling);
+
+double evaluateGTRGAMMAPROT_MIC(int *ex1, int *ex2, int *wptr,
+                 double *x1_start, double *x2_start,
+                 double *tipVector,
+                 unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling);
+
+void sumGTRGAMMAPROT_MIC(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+void coreGTRGAMMAPROT_MIC(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN, double *gammaRates, double lz, int *wrptr);
+
+// protein data - LG4
+
+void newviewGTRGAMMAPROT_LG4_MIC(int tipCase,
+                  double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
+                  unsigned char *tipX1, unsigned char *tipX2,
+                  int n, double *left, double *right, int *wgt, int *scalerIncrement);
+
+double evaluateGTRGAMMAPROT_LG4_MIC(int *wptr,
+                 double *x1_start, double *x2_start,
+                 double *tipVector[4],
+                 unsigned char *tipX1, const int n, double *diagptable);
+
+void sumGTRGAMMAPROT_LG4_MIC(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector[4],
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+void coreGTRGAMMAPROT_LG4_MIC(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN[4], double *gammaRates, double lz, int *wrptr);
+
+
+#endif /* MIC_NATIVE_H_ */
diff --git a/pll/mic_native_aa.c b/pll/mic_native_aa.c
new file mode 100644
index 0000000..2cfd2b1
--- /dev/null
+++ b/pll/mic_native_aa.c
@@ -0,0 +1,1254 @@
+#include <omp.h>
+#include <immintrin.h>
+#include <string.h>
+#include <math.h>
+
+#include "pll.h"
+#include "mic_native.h"
+
+static const int states = 20;
+static const int statesSquare = 20 * 20;
+static const int span = 20 * 4;
+static const int maxStateValue = 23;
+
+__inline void mic_fma4x80(const double* inv, double* outv, double* mulv)
+{
+    __mmask8 k1 = _mm512_int2mask(0x0F);
+    __mmask8 k2 = _mm512_int2mask(0xF0);
+    for(int l = 0; l < 80; l += 40)
+    {
+        __m512d t = _mm512_setzero_pd();
+
+        t = _mm512_extload_pd(&inv[l], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+        __m512d m = _mm512_load_pd(&mulv[l]);
+        __m512d acc = _mm512_load_pd(&outv[l]);
+        __m512d r = _mm512_fmadd_pd(t, m, acc);
+        _mm512_store_pd(&outv[l], r);
+
+        m = _mm512_load_pd(&mulv[l + 8]);
+        acc = _mm512_load_pd(&outv[l + 8]);
+        r = _mm512_fmadd_pd(t, m, acc);
+        _mm512_store_pd(&outv[l + 8], r);
+
+        t = _mm512_mask_extload_pd(t, k1, &inv[l], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+        t = _mm512_mask_extload_pd(t, k2, &inv[l+20], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+
+        m = _mm512_load_pd(&mulv[l + 16]);
+        acc = _mm512_load_pd(&outv[l + 16]);
+        r = _mm512_fmadd_pd(t, m, acc);
+        _mm512_store_pd(&outv[l + 16], r);
+
+        t = _mm512_extload_pd(&inv[l+20], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+        m = _mm512_load_pd(&mulv[l + 24]);
+        acc = _mm512_load_pd(&outv[l + 24]);
+        r = _mm512_fmadd_pd(t, m, acc);
+        _mm512_store_pd(&outv[l + 24], r);
+
+        m = _mm512_load_pd(&mulv[l + 32]);
+        acc = _mm512_load_pd(&outv[l + 32]);
+        r = _mm512_fmadd_pd(t, m, acc);
+        _mm512_store_pd(&outv[l + 32], r);
+    }
+}
+
+
+void newviewGTRGAMMAPROT_MIC(int tipCase,
+                  double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                  int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
+{
+  __m512d minlikelihood_MIC = _mm512_set1_pd(PLL_MINLIKELIHOOD);
+  __m512d twotothe256_MIC = _mm512_set1_pd(PLL_TWOTOTHE256);
+  __m512i absMask_MIC = _mm512_set1_epi64(0x7fffffffffffffffULL);
+
+  int addScale = 0;
+
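+  /* replicate each row of the 20x20 eigenvector matrix once per GAMMA category, so that
+     row k lines up with the 80-entry site vectors consumed by mic_fma4x80() */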
+  double aEV[1600] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+  #pragma ivdep
+  for (int l = 0; l < 1600; ++l)
+  {
+      aEV[l] = extEV[(l / span) * states + (l % states)];
+  }
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        /* multiply all possible tip state vectors with the respective P-matrices
+        */
+
+        double umpX1[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double umpX2[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+        for(int i = 0; i < maxStateValue; ++i)
+        {
+          for(int k = 0; k < span; ++k)
+          {
+              umpX1[i * span + k] = 0.0;
+              umpX2[i * span + k] = 0.0;
+
+              #pragma ivdep
+              for(int l = 0; l < states; ++l)
+              {
+                  umpX1[i * span + k] +=  tipVector[i * states + l] *  left[k * states + l];
+                  umpX2[i * span + k] +=  tipVector[i * states + l] * right[k * states + l];
+              }
+          }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+            const double *uX1 = &umpX1[span * tipX1[i]];
+            const double *uX2 = &umpX2[span * tipX2[i]];
+
+            double uX[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double* v3 = &x3[i * span];
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+                for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&uX[k], v3, &aEV[k * span]);
+            }
+
+            // init scaling counter for the site
+            if (!fastScaling)
+                ex3[i] = 0;
+
+        } // sites loop
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        /* we do pre-computations analogous to those above, with the only difference that we now
+        do them for one tip vector only */
+
+          double umpX1[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+        /* precompute P and left tip vector product */
+
+        for(int i = 0; i < maxStateValue; ++i)
+        {
+          for(int k = 0; k < span; ++k)
+          {
+              umpX1[i * span + k] = 0.0;
+
+              #pragma ivdep
+              for(int l = 0; l < states; ++l)
+              {
+                  umpX1[i * span + k] +=  tipVector[i * states + l] *  left[k * states + l];
+              }
+          }
+        }
+
+        // re-arrange right matrix for better memory layout
+        double aRight[4 * statesSquare] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int j = 0; j < 4; j++)
+        {
+            for(int k = 0; k < states; k++)
+            {
+                for(int l = 0; l < states; l++)
+                {
+                    aRight[k * span + j * states + l] = right[j * statesSquare +  l * states + k];
+                }
+            }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+            #pragma unroll(10)
+            for (int j = 0; j < span; j += 8)
+            {
+                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
+            }
+
+            /* access the pre-computed values, using the raw sequence data in tipX1 as the lookup index */
+            double* uX1 = &umpX1[span * tipX1[i]];
+            double uX2[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+            double* v3 = &(x3[span * i]);
+
+            const double* v2 = &(x2[span * i]);
+
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX2[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+                for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&v2[k], uX2, &aRight[k * span]);
+            }
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+                for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&uX[k], v3, &aEV[k * span]);
+            }
+
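+            /* numerical scaling: if every entry of the freshly computed vector is below
+               PLL_MINLIKELIHOOD, multiply the site by 2^256 and record the event */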
+            __m512d t1 = _mm512_load_pd(&v3[0]);
+            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
+            double vmax = _mm512_reduce_gmax_pd(t1);
+            double mx[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            for (int l = 8; l < span; l += 8)
+            {
+                __m512d t = _mm512_load_pd(&v3[l]);
+                t = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t), absMask_MIC));
+                double vmax2 = _mm512_reduce_gmax_pd(t);
+                vmax = PLL_MAX(vmax, vmax2);
+            }
+
+            if (vmax < PLL_MINLIKELIHOOD)
+            {
+                #pragma vector aligned nontemporal
+                for(int l = 0; l < span; l++)
+                  v3[l] *= PLL_TWOTOTHE256;
+
+                if(!fastScaling)
+                  ex3[i] += 1;
+                else
+                  addScale += wgt[i];
+            }
+        } // site loop
+
+      }
+      break;
+    case PLL_INNER_INNER:
+    {
+      /* same as above, without pre-computations */
+
+
+        // re-arrange the left and right matrices for better memory layout
+        double aLeft[4 * statesSquare] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double aRight[4 * statesSquare] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int j = 0; j < 4; j++)
+        {
+            for(int k = 0; k < states; k++)
+            {
+                for(int l = 0; l < states; l++)
+                {
+                    aLeft[k * span + j * states + l] = left[j * statesSquare + l * states + k];
+                    aRight[k * span + j * states + l] = right[j * statesSquare + l * states + k];
+                }
+            }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+
+            #pragma unroll(10)
+            for (int j = 0; j < span; j += 8)
+            {
+                _mm_prefetch((const char *)&x1[span*(i+1) + j], _MM_HINT_T1);
+                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
+            }
+
+
+            double uX1[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX2[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+            double* v3 = &(x3[span * i]);
+
+            const double* v1 = &(x1[span * i]);
+            const double* v2 = &(x2[span * i]);
+
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX1[l] = 0.;
+                uX2[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+                for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
+                    _mm_prefetch((const char *)&aLeft[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&v1[k], uX1, &aLeft[k * span]);
+                mic_fma4x80(&v2[k], uX2, &aRight[k * span]);
+            }
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+                for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&uX[k], v3, &aEV[k * span]);
+            }
+
+            __m512d t1 = _mm512_load_pd(&v3[0]);
+            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
+            double vmax = _mm512_reduce_gmax_pd(t1);
+            double mx[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            for (int l = 8; l < span; l += 8)
+            {
+                __m512d t = _mm512_load_pd(&v3[l]);
+                t = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t), absMask_MIC));
+                double vmax2 = _mm512_reduce_gmax_pd(t);
+                vmax = PLL_MAX(vmax, vmax2);
+            }
+
+            if (vmax < PLL_MINLIKELIHOOD)
+            {
+                #pragma vector aligned nontemporal
+                for(int l = 0; l < span; l++)
+                  v3[l] *= PLL_TWOTOTHE256;
+
+                if(!fastScaling)
+                  ex3[i] += 1;
+                else
+                  addScale += wgt[i];
+            }
+        }
+    } break;
+    default:
+//      assert(0);
+      break;
+  }
+
+  *scalerIncrement = addScale;
+
+}
+
+
+
+double evaluateGTRGAMMAPROT_MIC(int *ex1, int *ex2, int *wgt, double *x1_start, double *x2_start, double *tipVector,
+                 unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling)
+{
+    double sum = 0.0;
+
+    /* the left node is a tip */
+    if(tipX1)
+    {
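+        /* replicate each tip vector across the four GAMMA categories, so that a single
+           80-entry lookup per site covers the whole span */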
+        double aTipVec[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int k = 0; k < maxStateValue; k++)
+        {
+            for(int l = 0; l < states; l++)
+            {
+                aTipVec[k*span + l] = aTipVec[k*span + states + l] = aTipVec[k*span + 2*states + l] = aTipVec[k*span + 3*states + l] = tipVector[k*states + l];
+            }
+        }
+
+        /* loop over the sites of this partition */
+        for (int i = 0; i < n; i++)
+        {
+          /* access pre-computed tip vector values via a lookup table */
+          const double *x1 = &(aTipVec[span * tipX1[i]]);
+          /* access the other(inner) node at the other end of the branch */
+          const double *x2 = &(x2_start[span * i]);
+
+          double term = 0.;
+
+          #pragma ivdep
+          #pragma vector aligned
+          for(int j = 0; j < span; j++) {
+              term += x1[j] * x2[j] * diagptable[j];
+          }
+
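+          /* 0.25 is the weight of each of the four GAMMA categories; the ex2 term corrects
+             the log-likelihood for the per-site scaling multiplications done in newview */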
+          if(!fastScaling)
+              term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+          else
+              term = log(0.25 * fabs(term));
+
+          sum += wgt[i] * term;
+        }
+    }
+    else
+    {
+        for (int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char *) &x1_start[span*(i+8)], _MM_HINT_T1);
+            _mm_prefetch((const char *) &x1_start[span*(i+8) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char *) &x2_start[span*(i+8)], _MM_HINT_T1);
+            _mm_prefetch((const char *) &x2_start[span*(i+8) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char *) &x1_start[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char *) &x1_start[span*(i+1) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char *) &x2_start[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char *) &x2_start[span*(i+1) + 8], _MM_HINT_T0);
+
+          const double *x1 = &(x1_start[span * i]);
+          const double *x2 = &(x2_start[span * i]);
+
+          double term = 0.;
+
+          #pragma ivdep
+          #pragma vector aligned
+          for(int j = 0; j < span; j++)
+              term += x1[j] * x2[j] * diagptable[j];
+
+          if(!fastScaling)
+              term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+          else
+              term = log(0.25 * fabs(term));
+
+          sum += wgt[i] * term;
+        }
+    }
+
+    return sum;
+}
+
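+/* pre-compute the element-wise products of the two per-site vectors at either end of the
+   branch (the "sum table"); coreGTRGAMMAPROT_MIC() below re-uses these products when
+   evaluating the branch-length derivatives */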
+void sumGTRGAMMAPROT_MIC(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+    double aTipVec[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    for(int k = 0; k < maxStateValue; k++)
+    {
+        for(int l = 0; l < states; l++)
+        {
+            aTipVec[k*span + l] = aTipVec[k*span + states + l] = aTipVec[k*span + 2*states + l] = aTipVec[k*span + 3*states + l] = tipVector[k*states + l];
+        }
+    }
+
+    switch(tipCase)
+    {
+      case PLL_TIP_TIP:
+      {
+        for(int i = 0; i < n; i++)
+        {
+            const double *left  = &(aTipVec[span * tipX1[i]]);
+            const double *right = &(aTipVec[span * tipX2[i]]);
+
+            #pragma ivdep
+            #pragma vector aligned nontemporal
+            for(int l = 0; l < span; l++)
+            {
+                sumtable[i * span + l] = left[l] * right[l];
+            }
+        }
+      } break;
+      case PLL_TIP_INNER:
+      {
+        for(int i = 0; i < n; i++)
+        {
+          _mm_prefetch((const char *) &x2_start[span*(i+16)], _MM_HINT_T1);
+          _mm_prefetch((const char *) &x2_start[span*(i+16) + 8], _MM_HINT_T1);
+
+          _mm_prefetch((const char *) &x2_start[span*(i+2)], _MM_HINT_T0);
+          _mm_prefetch((const char *) &x2_start[span*(i+2) + 8], _MM_HINT_T0);
+
+          const double *left = &(aTipVec[span * tipX1[i]]);
+          const double *right = &(x2_start[span * i]);
+
+          #pragma ivdep
+          #pragma vector aligned nontemporal
+          for(int l = 0; l < span; l++)
+          {
+              sumtable[i * span + l] = left[l] * right[l];
+          }
+        }
+      } break;
+      case PLL_INNER_INNER:
+      {
+        for(int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char *) &x1_start[span*(i+16)], _MM_HINT_T1);
+            _mm_prefetch((const char *) &x1_start[span*(i+16) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char *) &x2_start[span*(i+16)], _MM_HINT_T1);
+            _mm_prefetch((const char *) &x2_start[span*(i+16) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char *) &x1_start[span*(i+2)], _MM_HINT_T0);
+            _mm_prefetch((const char *) &x1_start[span*(i+2) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char *) &x2_start[span*(i+2)], _MM_HINT_T0);
+            _mm_prefetch((const char *) &x2_start[span*(i+2) + 8], _MM_HINT_T0);
+
+            const double *left  = &(x1_start[span * i]);
+            const double *right = &(x2_start[span * i]);
+
+            #pragma ivdep
+            #pragma vector aligned nontemporal
+            for(int l = 0; l < span; l++)
+            {
+                sumtable[i * span + l] = left[l] * right[l];
+            }
+        }
+      } break;
+  //    default:
+  //      assert(0);
+    }
+}
+
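+/* compute the first and second derivatives of the per-partition log-likelihood with
+   respect to the branch-length parameter lz, using the pre-computed sum table; results
+   are returned via *ext_dlnLdlz and *ext_d2lnLdlz2 */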
+void coreGTRGAMMAPROT_MIC(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN, double *gammaRates, double lz, int *wgt)
+{
+    double diagptable0[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable1[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable2[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable01[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable02[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+    /* pre-compute the derivatives of the P matrix for all discrete GAMMA rates */
+
+    for(int i = 0; i < 4; i++)
+    {
+        const double ki = gammaRates[i];
+        const double kisqr = ki * ki;
+
+        diagptable0[i*states] = 1.;
+        diagptable1[i*states] = 0.;
+        diagptable2[i*states] = 0.;
+
+        for(int l = 1; l < states; l++)
+        {
+          diagptable0[i * states + l]  = exp(EIGN[l] * ki * lz);
+          diagptable1[i * states + l] = EIGN[l] * ki;
+          diagptable2[i * states + l] = EIGN[l] * EIGN[l] * kisqr;
+        }
+    }
+
+    #pragma ivdep
+    for(int i = 0; i < span; i++)
+    {
+        diagptable01[i] = diagptable0[i] * diagptable1[i];
+        diagptable02[i] = diagptable0[i] * diagptable2[i];
+    }
+
+    /* loop over sites in this partition */
+
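+    /* number of PLL_VECTOR_WIDTH-site blocks, rounded up */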
+    const int aligned_width = upper % PLL_VECTOR_WIDTH == 0 ? upper / PLL_VECTOR_WIDTH : upper / PLL_VECTOR_WIDTH + 1;
+
+    double dlnLdlz = 0.;
+    double d2lnLdlz2 = 0.;
+
+    __mmask16 k1 = _mm512_int2mask(0x000000FF);
+
+    for (int i = 0; i < aligned_width; i++)
+    {
+        _mm_prefetch((const char *) &sumtable[i * span * 8], _MM_HINT_T0);
+        _mm_prefetch((const char *) &sumtable[i * span * 8 + 8], _MM_HINT_T0);
+
+        /* access the array with pre-computed values */
+        const double *sum = &sumtable[i * span * PLL_VECTOR_WIDTH];
+
+        /* initial per-site likelihood and 1st and 2nd derivatives */
+
+        double invBuf[PLL_VECTOR_WIDTH] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double d1Buf[PLL_VECTOR_WIDTH] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double d2Buf[PLL_VECTOR_WIDTH] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+        __m512d invVec;
+        __m512d d1Vec;
+        __m512d d2Vec;
+        int mask = 0x01;
+
+        #pragma noprefetch sum
+        #pragma unroll(8)
+        for(int j = 0; j < PLL_VECTOR_WIDTH; j++)
+        {
+            _mm_prefetch((const char *) &sum[span*(j+8)], _MM_HINT_T1);
+            _mm_prefetch((const char *) &sum[span*(j+8) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char *) &sum[span*(j+1)], _MM_HINT_T0);
+            _mm_prefetch((const char *) &sum[span*(j+1) + 8], _MM_HINT_T0);
+
+            __m512d inv_1 = _mm512_setzero_pd();
+            __m512d d1_1 = _mm512_setzero_pd();
+            __m512d d2_1 = _mm512_setzero_pd();
+
+            for (int offset = 0; offset < span; offset += 8)
+            {
+                __m512d d0_1 = _mm512_load_pd(&diagptable0[offset]);
+                __m512d d01_1 = _mm512_load_pd(&diagptable01[offset]);
+                __m512d d02_1 = _mm512_load_pd(&diagptable02[offset]);
+                __m512d s_1 = _mm512_load_pd(&sum[j*span + offset]);
+
+                inv_1 = _mm512_fmadd_pd(d0_1, s_1, inv_1);
+                d1_1 = _mm512_fmadd_pd(d01_1, s_1, d1_1);
+                d2_1 = _mm512_fmadd_pd(d02_1, s_1, d2_1);
+            }
+
+            __mmask8 k1 = _mm512_int2mask(mask);
+            mask <<= 1;
+
+            // horizontal sum of the 8 partial sums (swizzle/permute reduction), then
+            // write the per-site total into lane j of the accumulator vector
+            inv_1 = _mm512_add_pd (inv_1, _mm512_swizzle_pd(inv_1, _MM_SWIZ_REG_CDAB));
+            inv_1 = _mm512_add_pd (inv_1, _mm512_swizzle_pd(inv_1, _MM_SWIZ_REG_BADC));
+            inv_1 = _mm512_add_pd (inv_1, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(inv_1), _MM_PERM_BADC)));
+            invVec = _mm512_mask_mov_pd(invVec, k1, inv_1);
+
+            d1_1 = _mm512_add_pd (d1_1, _mm512_swizzle_pd(d1_1, _MM_SWIZ_REG_CDAB));
+            d1_1 = _mm512_add_pd (d1_1, _mm512_swizzle_pd(d1_1, _MM_SWIZ_REG_BADC));
+            d1_1 = _mm512_add_pd (d1_1, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(d1_1), _MM_PERM_BADC)));
+            d1Vec = _mm512_mask_mov_pd(d1Vec, k1, d1_1);
+
+            d2_1 = _mm512_add_pd (d2_1, _mm512_swizzle_pd(d2_1, _MM_SWIZ_REG_CDAB));
+            d2_1 = _mm512_add_pd (d2_1, _mm512_swizzle_pd(d2_1, _MM_SWIZ_REG_BADC));
+            d2_1 = _mm512_add_pd (d2_1, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(d2_1), _MM_PERM_BADC)));
+            d2Vec = _mm512_mask_mov_pd(d2Vec, k1, d2_1);
+        }
+
+        _mm512_store_pd(&invBuf[0], invVec);
+        _mm512_store_pd(&d1Buf[0], d1Vec);
+        _mm512_store_pd(&d2Buf[0], d2Vec);
+
+        #pragma ivdep
+        #pragma vector aligned
+        for (int j = 0; j < PLL_VECTOR_WIDTH; ++j)
+        {
+            const double inv_Li = 1.0 / invBuf[j];
+
+            const double d1 = d1Buf[j] * inv_Li;
+            const double d2 = d2Buf[j] * inv_Li;
+
+            dlnLdlz += wgt[i * PLL_VECTOR_WIDTH + j] * d1;
+            d2lnLdlz2 += wgt[i * PLL_VECTOR_WIDTH + j] * (d2 - d1 * d1);
+        }
+    } // site loop
+
+    *ext_dlnLdlz   = dlnLdlz;
+    *ext_d2lnLdlz2 = d2lnLdlz2;
+}
+
+
+/****
+ *       PROTEIN - LG4
+ */
+
+void newviewGTRGAMMAPROT_LG4_MIC(int tipCase,
+                  double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
+                  unsigned char *tipX1, unsigned char *tipX2,
+                  int n, double *left, double *right, int *wgt, int *scalerIncrement)
+{
+
+  __m512d minlikelihood_MIC = _mm512_set1_pd(PLL_MINLIKELIHOOD);
+  __m512d twotothe256_MIC = _mm512_set1_pd(PLL_TWOTOTHE256);
+  __m512i absMask_MIC = _mm512_set1_epi64(0x7fffffffffffffffULL);
+
+  int addScale = 0;
+
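+  /* as in newviewGTRGAMMAPROT_MIC(), but each GAMMA category has its own LG4 eigenvector
+     matrix, so the replicated row is taken from extEV[category] */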
+  double aEV[1600] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+  #pragma ivdep
+  for (int l = 0; l < 1600; ++l)
+  {
+      aEV[l] = extEV[(l % span) / states][(l / span) * states + (l % states)];
+  }
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        /* multiply all possible tip state vectors with the respective P-matrices
+        */
+
+        double umpX1[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double umpX2[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+        for(int i = 0; i < 23; ++i)
+        {
+          for(int k = 0; k < span; ++k)
+          {
+              umpX1[i * span + k] = 0.0;
+              umpX2[i * span + k] = 0.0;
+              double *tipv = &(tipVector[k / states][i * states]);
+
+
+              #pragma ivdep
+              for(int l = 0; l < states; ++l)
+              {
+                  umpX1[i * span + k] +=  tipv[l] *  left[k * states + l];
+                  umpX2[i * span + k] +=  tipv[l] * right[k * states + l];
+              }
+          }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+            const double *uX1 = &umpX1[span * tipX1[i]];
+            const double *uX2 = &umpX2[span * tipX2[i]];
+
+            double uX[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double* v3 = &x3[i * span];
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+                for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&uX[k], v3, &aEV[k * span]);
+            }
+
+        } // sites loop
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        /* we do pre-computations analogous to those above, with the only difference that we now
+        do them for one tip vector only */
+
+          double umpX1[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+        /* precompute P and left tip vector product */
+
+        for(int i = 0; i < 23; ++i)
+        {
+          for(int k = 0; k < span; ++k)
+          {
+              umpX1[i * span + k] = 0.0;
+              double *tipv = &(tipVector[k / states][i * states]);
+
+              #pragma ivdep
+              for(int l = 0; l < states; ++l)
+              {
+                  umpX1[i * span + k] +=  tipv[l] *  left[k * states + l];
+              }
+          }
+        }
+
+        // re-arrange right matrix for better memory layout
+        double aRight[4 * statesSquare] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int j = 0; j < 4; j++)
+        {
+            for(int k = 0; k < states; k++)
+            {
+                for(int l = 0; l < states; l++)
+                {
+                    aRight[k * span + j * states + l] = right[j * statesSquare +  l * states + k];
+                }
+            }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+            #pragma unroll(10)
+            for (int j = 0; j < span; j += 8)
+            {
+                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
+            }
+
+            /* access the pre-computed values, using the raw sequence data in tipX1 as the lookup index */
+            double* uX1 = &umpX1[span * tipX1[i]];
+            double uX2[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+            double* v3 = &(x3[span * i]);
+
+            const double* v2 = &(x2[span * i]);
+
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX2[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+				#pragma unroll(10)
+            	for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&v2[k], uX2, &aRight[k * span]);
+            }
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+				#pragma unroll(10)
+            	for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&uX[k], v3, &aEV[k * span]);
+            }
+
+
+            __m512d t1 = _mm512_load_pd(&v3[0]);
+            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
+            double vmax = _mm512_reduce_gmax_pd(t1);
+            double mx[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            for (int l = 8; l < span; l += 8)
+            {
+                __m512d t = _mm512_load_pd(&v3[l]);
+                t = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t), absMask_MIC));
+                double vmax2 = _mm512_reduce_gmax_pd(t);
+                vmax = PLL_MAX(vmax, vmax2);
+            }
+
+            if (vmax < PLL_MINLIKELIHOOD)
+            {
+                #pragma vector aligned nontemporal
+                for(int l = 0; l < span; l++)
+                  v3[l] *= PLL_TWOTOTHE256;
+
+                addScale += wgt[i];
+            }
+        } // site loop
+
+      }
+      break;
+    case PLL_INNER_INNER:
+    {
+      /* same as above, without pre-computations */
+
+        // re-arrange the left and right matrices for better memory layout
+        double aLeft[4 * statesSquare] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double aRight[4 * statesSquare] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int j = 0; j < 4; j++)
+        {
+            for(int k = 0; k < states; k++)
+            {
+                for(int l = 0; l < states; l++)
+                {
+                    aLeft[k * span + j * states + l] = left[j * statesSquare + l * states + k];
+                    aRight[k * span + j * states + l] = right[j * statesSquare + l * states + k];
+                }
+            }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+
+            #pragma unroll(10)
+            for (int j = 0; j < span; j += 8)
+            {
+                _mm_prefetch((const char *)&x1[span*(i+1) + j], _MM_HINT_T1);
+                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
+            }
+
+
+            double uX1[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX2[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+            double* v3 = &(x3[span * i]);
+
+            const double* v1 = &(x1[span * i]);
+            const double* v2 = &(x2[span * i]);
+
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX1[l] = 0.;
+                uX2[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+				#pragma unroll(10)
+            	for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
+                    _mm_prefetch((const char *)&aLeft[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&v1[k], uX1, &aLeft[k * span]);
+                mic_fma4x80(&v2[k], uX2, &aRight[k * span]);
+            }
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+				#pragma unroll(10)
+            	for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&uX[k], v3, &aEV[k * span]);
+            }
+
+            __m512d t1 = _mm512_load_pd(&v3[0]);
+            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
+            double vmax = _mm512_reduce_gmax_pd(t1);
+            double mx[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            for (int l = 8; l < span; l += 8)
+            {
+                __m512d t = _mm512_load_pd(&v3[l]);
+                t = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t), absMask_MIC));
+                double vmax2 = _mm512_reduce_gmax_pd(t);
+                vmax = PLL_MAX(vmax, vmax2);
+            }
+
+            if (vmax < PLL_MINLIKELIHOOD)
+            {
+                #pragma vector aligned nontemporal
+                for(int l = 0; l < span; l++)
+                  v3[l] *= PLL_TWOTOTHE256;
+
+                addScale += wgt[i];
+            }
+        }
+    } break;
+    default:
+//      assert(0);
+      break;
+  }
+
+  *scalerIncrement = addScale;
+
+}
+
+
+
+double evaluateGTRGAMMAPROT_LG4_MIC(int *wgt, double *x1_start, double *x2_start, double *tipVector[4],
+                 unsigned char *tipX1, const int n, double *diagptable)
+{
+    double sum = 0.0;
+
+    /* the left node is a tip */
+    if(tipX1)
+    {
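+        /* unlike the non-LG4 version, each GAMMA category takes its tip values from its
+           own LG4 tip vector (tipVector[j]) rather than replicating a single vector */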
+        double aTipVec[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int k = 0; k < 23; k++)
+        {
+            for(int j = 0; j < 4; j++)
+            {
+				for(int l = 0; l < states; l++)
+				{
+					aTipVec[k*span + j*states + l] = tipVector[j][k*states + l];
+				}
+            }
+        }
+
+        /* loop over the sites of this partition */
+        for (int i = 0; i < n; i++)
+        {
+			/* access pre-computed tip vector values via a lookup table */
+			const double *x1 = &(aTipVec[span * tipX1[i]]);
+			/* access the other(inner) node at the other end of the branch */
+			const double *x2 = &(x2_start[span * i]);
+
+			#pragma unroll(10)
+			for (int k = 0; k < span; k += 8)
+			{
+				_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+			}
+
+			double term = 0.;
+
+			#pragma ivdep
+			#pragma vector aligned
+			#pragma noprefetch x2
+			for(int j = 0; j < span; j++) {
+			  term += x1[j] * x2[j] * diagptable[j];
+			}
+
+			term = log(0.25 * fabs(term));
+
+			sum += wgt[i] * term;
+        }
+    }
+    else
+    {
+        for (int i = 0; i < n; i++)
+        {
+			#pragma unroll(10)
+			for (int k = 0; k < span; k += 8)
+			{
+				_mm_prefetch((const char *) &x1_start[span*(i+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &x1_start[span*(i+1) + k], _MM_HINT_T0);
+
+				_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+			}
+
+			const double *x1 = &(x1_start[span * i]);
+			const double *x2 = &(x2_start[span * i]);
+
+			double term = 0.;
+
+			#pragma ivdep
+			#pragma vector aligned
+			#pragma noprefetch x1 x2
+			for(int j = 0; j < span; j++)
+			  term += x1[j] * x2[j] * diagptable[j];
+
+			term = log(0.25 * fabs(term));
+
+			sum += wgt[i] * term;
+        }
+    }
+
+    return sum;
+}
+
+void sumGTRGAMMAPROT_LG4_MIC(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector[4],
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+    double aTipVec[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    for(int k = 0; k < maxStateValue; k++)
+    {
+        for(int j = 0; j < 4; j++)
+        {
+			for(int l = 0; l < states; l++)
+			{
+				aTipVec[k*span + j*states + l] = tipVector[j][k*states + l];
+			}
+        }
+    }
+
+    switch(tipCase)
+    {
+      case PLL_TIP_TIP:
+      {
+        for(int i = 0; i < n; i++)
+        {
+            const double *left  = &(aTipVec[span * tipX1[i]]);
+            const double *right = &(aTipVec[span * tipX2[i]]);
+
+            #pragma ivdep
+            #pragma vector aligned nontemporal
+            for(int l = 0; l < span; l++)
+            {
+                sumtable[i * span + l] = left[l] * right[l];
+            }
+        }
+      } break;
+      case PLL_TIP_INNER:
+      {
+        for(int i = 0; i < n; i++)
+        {
+			#pragma unroll(10)
+			for (int k = 0; k < span; k += 8)
+			{
+				_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+			}
+
+          const double *left = &(aTipVec[span * tipX1[i]]);
+          const double *right = &(x2_start[span * i]);
+
+          #pragma ivdep
+          #pragma vector aligned nontemporal
+		  #pragma noprefetch right
+          for(int l = 0; l < span; l++)
+          {
+              sumtable[i * span + l] = left[l] * right[l];
+          }
+        }
+      } break;
+      case PLL_INNER_INNER:
+      {
+        for(int i = 0; i < n; i++)
+        {
+			#pragma unroll(10)
+			for (int k = 0; k < span; k += 8)
+			{
+				_mm_prefetch((const char *) &x1_start[span*(i+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &x1_start[span*(i+1) + k], _MM_HINT_T0);
+
+				_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+			}
+
+            const double *left  = &(x1_start[span * i]);
+            const double *right = &(x2_start[span * i]);
+
+            #pragma ivdep
+            #pragma vector aligned nontemporal
+			#pragma noprefetch left right
+            for(int l = 0; l < span; l++)
+            {
+                sumtable[i * span + l] = left[l] * right[l];
+            }
+        }
+      } break;
+  //    default:
+  //      assert(0);
+    }
+}
+
+void coreGTRGAMMAPROT_LG4_MIC(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN[4], double *gammaRates, double lz, int *wgt)
+{
+    double diagptable0[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable1[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable2[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable01[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable02[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+    /* pre-compute the derivatives of the P matrix for all discrete GAMMA rates */
+
+    for(int i = 0; i < 4; i++)
+    {
+        const double ki = gammaRates[i];
+        const double kisqr = ki * ki;
+
+        diagptable0[i*states] = 1.;
+        diagptable1[i*states] = 0.;
+        diagptable2[i*states] = 0.;
+
+        for(int l = 1; l < states; l++)
+        {
+          diagptable0[i * states + l]  = exp(EIGN[i][l] * ki * lz);
+          diagptable1[i * states + l] = EIGN[i][l] * ki;
+          diagptable2[i * states + l] = EIGN[i][l] * EIGN[i][l] * kisqr;
+        }
+    }
+
+    #pragma ivdep
+    for(int i = 0; i < span; i++)
+    {
+        diagptable01[i] = diagptable0[i] * diagptable1[i];
+        diagptable02[i] = diagptable0[i] * diagptable2[i];
+    }
+
+    /* loop over sites in this partition */
+
+    const int aligned_width = upper % 8 == 0 ? upper / 8 : upper / 8 + 1;
+
+    double dlnLdlz = 0.;
+    double d2lnLdlz2 = 0.;
+
+    __mmask16 k1 = _mm512_int2mask(0x000000FF);
+
+    for (int i = 0; i < aligned_width; i++)
+    {
+        /* access the array with pre-computed values */
+        const double *sum = &sumtable[i * span * 8];
+
+        /* initial per-site likelihood and 1st and 2nd derivatives */
+
+        double invBuf[8] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double d1Buf[8] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double d2Buf[8] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+        __m512d invVec;
+        __m512d d1Vec;
+        __m512d d2Vec;
+        int mask = 0x01;
+
+        #pragma noprefetch sum
+        #pragma unroll(8)
+        for(int j = 0; j < 8; j++)
+        {
+
+        	#pragma unroll(10)
+			for (int k = 0; k < span; k += 8)
+			{
+				_mm_prefetch((const char *) &sum[span*(j+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &sum[span*(j+1) + k], _MM_HINT_T0);
+			}
+
+            __m512d inv_1 = _mm512_setzero_pd();
+            __m512d d1_1 = _mm512_setzero_pd();
+            __m512d d2_1 = _mm512_setzero_pd();
+
+            for (int offset = 0; offset < span; offset += 8)
+            {
+                __m512d d0_1 = _mm512_load_pd(&diagptable0[offset]);
+                __m512d d01_1 = _mm512_load_pd(&diagptable01[offset]);
+                __m512d d02_1 = _mm512_load_pd(&diagptable02[offset]);
+                __m512d s_1 = _mm512_load_pd(&sum[j*span + offset]);
+
+                inv_1 = _mm512_fmadd_pd(d0_1, s_1, inv_1);
+                d1_1 = _mm512_fmadd_pd(d01_1, s_1, d1_1);
+                d2_1 = _mm512_fmadd_pd(d02_1, s_1, d2_1);
+            }
+
+            __mmask8 k1 = _mm512_int2mask(mask);
+            mask <<= 1;
+
+            // horizontal sum of the 8 partial sums (swizzle/permute reduction), then
+            // write the per-site total into lane j of the accumulator vector
+            inv_1 = _mm512_add_pd (inv_1, _mm512_swizzle_pd(inv_1, _MM_SWIZ_REG_CDAB));
+            inv_1 = _mm512_add_pd (inv_1, _mm512_swizzle_pd(inv_1, _MM_SWIZ_REG_BADC));
+            inv_1 = _mm512_add_pd (inv_1, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(inv_1), _MM_PERM_BADC)));
+            invVec = _mm512_mask_mov_pd(invVec, k1, inv_1);
+
+            d1_1 = _mm512_add_pd (d1_1, _mm512_swizzle_pd(d1_1, _MM_SWIZ_REG_CDAB));
+            d1_1 = _mm512_add_pd (d1_1, _mm512_swizzle_pd(d1_1, _MM_SWIZ_REG_BADC));
+            d1_1 = _mm512_add_pd (d1_1, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(d1_1), _MM_PERM_BADC)));
+            d1Vec = _mm512_mask_mov_pd(d1Vec, k1, d1_1);
+
+            d2_1 = _mm512_add_pd (d2_1, _mm512_swizzle_pd(d2_1, _MM_SWIZ_REG_CDAB));
+            d2_1 = _mm512_add_pd (d2_1, _mm512_swizzle_pd(d2_1, _MM_SWIZ_REG_BADC));
+            d2_1 = _mm512_add_pd (d2_1, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(d2_1), _MM_PERM_BADC)));
+            d2Vec = _mm512_mask_mov_pd(d2Vec, k1, d2_1);
+        }
+
+        _mm512_store_pd(&invBuf[0], invVec);
+        _mm512_store_pd(&d1Buf[0], d1Vec);
+        _mm512_store_pd(&d2Buf[0], d2Vec);
+
+        #pragma ivdep
+        #pragma vector aligned
+        for (int j = 0; j < 8; ++j)
+        {
+            const double inv_Li = 1.0 / invBuf[j];
+
+            const double d1 = d1Buf[j] * inv_Li;
+            const double d2 = d2Buf[j] * inv_Li;
+
+            dlnLdlz += wgt[i * 8 + j] * d1;
+            d2lnLdlz2 += wgt[i * 8 + j] * (d2 - d1 * d1);
+        }
+    } // site loop
+
+    *ext_dlnLdlz   = dlnLdlz;
+    *ext_d2lnLdlz2 = d2lnLdlz2;
+}
+
diff --git a/pll/mic_native_dna.c b/pll/mic_native_dna.c
new file mode 100644
index 0000000..6dd6631
--- /dev/null
+++ b/pll/mic_native_dna.c
@@ -0,0 +1,676 @@
+#include <omp.h>
+#include <immintrin.h>
+#include <string.h>
+#include <math.h>
+
+#include "pll.h"
+#include "mic_native.h"
+
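+/* per-partition constants for the DNA kernels: 4 states, span = 4 states x 4 GAMMA rate
+   categories = 16 doubles per site, and 16 distinct tip codes (hence the 16 * 16 = 256-entry
+   tip lookup tables used below) */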
+static const int states = 4;
+static const int statesSquare = 16;
+static const int span = 4 * 4;
+static const int maxStateValue = 16;
+
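+/* transpose/broadcast a 16-entry site vector (4 GAMMA categories x 4 states) into a
+   64-entry array: the j-th 16-entry block holds state j of every category, each value
+   replicated four times, so the 4x4 matrix-vector products below vectorize cleanly */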
+__inline void mic_broadcast16x64(const double* inv, double* outv)
+{
+    __mmask8 k1 = _mm512_int2mask(0x0F);
+    __mmask8 k2 = _mm512_int2mask(0xF0);
+    for(int l = 0; l < 16; l += 2)
+    {
+        __m512d t = _mm512_setzero_pd();
+        t = _mm512_mask_extload_pd(t, k1, &inv[(l%4)*4 + l/4], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+        t = _mm512_mask_extload_pd(t, k2, &inv[((l+1)%4)*4 + (l+1)/4], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+
+        _mm512_store_pd(&outv[l*4], t);
+    }
+}
+
+void newviewGTRGAMMA_MIC(int tipCase,
+                  double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                  int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
+{
+    __m512d minlikelihood_MIC = _mm512_set1_pd(PLL_MINLIKELIHOOD);
+    __m512d twotothe256_MIC = _mm512_set1_pd(PLL_TWOTOTHE256);
+    __m512i absMask_MIC = _mm512_set1_epi64(0x7fffffffffffffffULL);
+
+	int addScale = 0;
+
+    double aEV[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+    #pragma ivdep
+    for (int l = 0; l < 64; ++l)
+    {
+        aEV[l] = extEV[(l / 16) * 4 + (l % 4)];
+    }
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        /* multiply all possible tip state vectors with the respective P-matrices
+        */
+
+            double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double umpX2[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+            for(int k = 0; k < 256; ++k)
+            {
+                umpX1[k] = 0.0;
+                umpX2[k] = 0.0;
+            }
+
+            for(int i = 0; i < maxStateValue; ++i)
+            {
+              for(int l = 0; l < states; ++l)
+              {
+                  #pragma ivdep
+                  for(int k = 0; k < span; ++k)
+                  {
+                      umpX1[16 * i + k] +=  tipVector[i * 4 + l] *  left[k * 4 + l];
+                      umpX2[16 * i + k] +=  tipVector[i * 4 + l] * right[k * 4 + l];
+                  }
+              }
+            }
+
+        double auX[64] __attribute__((align(64)));
+
+        for(int i = 0; i < n; ++i)
+        {
+            _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);
+            _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);
+
+            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
+            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);
+
+            const double *uX1 = &umpX1[16 * tipX1[i]];
+            const double *uX2 = &umpX2[16 * tipX2[i]];
+
+            double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double* v = &x3[i * 16];
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < 16; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v[l] = 0.;
+            }
+
+            mic_broadcast16x64(uX, auX);
+
+            for (int j = 0; j < 4; ++j)
+            {
+                #pragma ivdep
+                #pragma vector aligned
+                #pragma vector nontemporal
+                for(int k = 0; k < 16; ++k)
+                {
+                    v[k] += auX[j*16 + k] * aEV[j*16 + k];
+                }
+            }
+
+            // init scaling counter for the site
+            if (!fastScaling)
+                ex3[i] = 0;
+
+        } // sites loop
+
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        /* we do pre-computations analogous to those above, with the only difference that we now
+        do them for one tip vector only */
+
+          double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+        /* precompute P and left tip vector product */
+
+        for(int k = 0; k < 256; ++k)
+        {
+            umpX1[k] = 0.0;
+        }
+
+        for(int i = 0; i < 16; ++i)
+        {
+          for(int l = 0; l < 4; ++l)
+          {
+              #pragma ivdep
+              for(int k = 0; k < 16; ++k)
+              {
+                  umpX1[16 * i + k] +=  tipVector[i * 4 + l] *  left[k * 4 + l];
+              }
+          }
+        }
+
+        // re-arrange right matrix for better memory layout
+        double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int j = 0; j < 4; j++)
+        {
+            for(int l = 0; l < 16; l++)
+            {
+                aRight[j*16 + l] = right[l*4 + j];
+            }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char*) &x2[span*(i+16)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2[span*(i+16) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x3[span*(i+16)], _MM_HINT_ET1);
+            _mm_prefetch((const char*) &x3[span*(i+16) + 8], _MM_HINT_ET1);
+
+            _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
+            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);
+
+            /* access the pre-computed values, using the raw sequence data in tipX1 as the lookup index */
+            double* uX1 = &umpX1[span * tipX1[i]];
+            double uX2[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+            #pragma vector aligned
+            for(int l = 0; l < 16; ++l)
+            {
+                uX2[l] = 0.;
+            }
+
+            double aV2[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            const double* v2 = &(x2[16 * i]);
+
+            mic_broadcast16x64(v2, aV2);
+
+            for(int j = 0; j < 4; j++)
+            {
+                #pragma ivdep
+                #pragma vector aligned
+                for(int l = 0; l < 16; l++)
+                {
+                    uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
+                }
+            }
+
+            double* v3 = &(x3[span * i]);
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < 16; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            double auX[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            mic_broadcast16x64(uX, auX);
+
+            for (int j = 0; j < 4; ++j)
+            {
+                #pragma ivdep
+                #pragma vector aligned
+                for(int k = 0; k < 16; ++k)
+                {
+                    v3[k] += auX[j*16 + k] * aEV[j*16 + k];
+                }
+            }
+
+            __m512d t1 = _mm512_load_pd(&v3[0]);
+            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
+            double vmax1 = _mm512_reduce_gmax_pd(t1);
+            __m512d t2 = _mm512_load_pd(&v3[8]);
+            t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
+            double vmax2 = _mm512_reduce_gmax_pd(t2);
+
+            if(vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD)
+            {
+				t1 = _mm512_mul_pd(t1, twotothe256_MIC);
+				_mm512_store_pd(&v3[0], t1);
+				t2 = _mm512_mul_pd(t2, twotothe256_MIC);
+				_mm512_store_pd(&v3[8], t2);
+
+                if(!fastScaling)
+                  ex3[i] += 1;
+                else
+                  addScale += wgt[i];
+            }
+        } // site loop
+      }
+      break;
+    case PLL_INNER_INNER:
+    {
+      /* same as above, without pre-computations */
+
+        // re-arrange the left and right matrices for better memory layout
+        double aLeft[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int j = 0; j < 4; j++)
+        {
+            for(int l = 0; l < 16; l++)
+            {
+                aLeft[j*16 + l] = left[l*4 + j];
+                aRight[j*16 + l] = right[l*4 + j];
+            }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char*) &x1[span*(i+8)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x1[span*(i+8) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2[span*(i+8)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2[span*(i+8) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);
+            _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);
+
+            _mm_prefetch((const char*) &x1[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x1[span*(i+1) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
+            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);
+
+            double uX1[16] __attribute__((align(64)));
+            double uX2[16] __attribute__((align(64)));
+            double uX[16] __attribute__((align(64)));
+
+            for(int l = 0; l < 16; l++)
+            {
+              uX1[l] = 0.;
+              uX2[l] = 0.;
+            }
+
+            double aV1[64] __attribute__((align(64)));
+            double aV2[64] __attribute__((align(64)));
+
+            const double* v1 = &(x1[span * i]);
+            const double* v2 = &(x2[span * i]);
+
+            mic_broadcast16x64(v1, aV1);
+
+            mic_broadcast16x64(v2, aV2);
+
+            for(int j = 0; j < 4; j++)
+            {
+                #pragma ivdep
+                #pragma vector aligned
+                for(int l = 0; l < 16; l++)
+                {
+                    uX1[l] += aV1[j*16 + l] * aLeft[j*16 + l];
+                    uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
+                }
+            }
+
+            double* v3 =  &(x3[span * i]);
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < 16; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            double auX[64] __attribute__((align(64)));
+            mic_broadcast16x64(uX, auX);
+
+            for(int j = 0; j < 4; ++j)
+            {
+                #pragma ivdep
+                #pragma vector aligned
+                for(int k = 0; k < 16; ++k)
+                {
+                    v3[k] += auX[j*16 + k] * aEV[j*16 + k];
+                }
+            }
+
+
+            __m512d t1 = _mm512_load_pd(&v3[0]);
+            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
+            double vmax1 = _mm512_reduce_gmax_pd(t1);
+            __m512d t2 = _mm512_load_pd(&v3[8]);
+            t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
+            double vmax2 = _mm512_reduce_gmax_pd(t2);
+
+            if(vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD)
+            {
+				t1 = _mm512_mul_pd(t1, twotothe256_MIC);
+				_mm512_store_pd(&v3[0], t1);
+				t2 = _mm512_mul_pd(t2, twotothe256_MIC);
+				_mm512_store_pd(&v3[8], t2);
+
+                if(!fastScaling)
+                  ex3[i] += 1;
+                else
+                  addScale += wgt[i];
+            }
+        }
+    } break;
+    default:
+//      assert(0);
+      break;
+  }
+
+  /* as above, pass back via *scalerIncrement the number of scaling multiplications
+     carried out while computing the likelihood array at node p */
+
+  if (fastScaling)
+  {
+      *scalerIncrement = addScale;
+  }
+
+}
+
+double evaluateGTRGAMMA_MIC(int *ex1, int *ex2, int *wgt,
+                 double *x1_start, double *x2_start,
+                 double *tipVector,
+                 unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling)
+{
+	double sum = 0.0;
+
+    /* the left node is a tip */
+    if(tipX1)
+    {
+
+        double aTipVec[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int k = 0; k < 16; k++)
+        {
+            for(int l = 0; l < 4; l++)
+            {
+                aTipVec[k*16 + l] = aTipVec[k*16 + 4 + l] = aTipVec[k*16 + 8 + l] = aTipVec[k*16 + 12 + l] = tipVector[k*4 + l];
+            }
+        }
+
+        /* loop over the sites of this partition */
+        for (int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char*) &x2_start[span*(i+8)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2_start[span*(i+8) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char*) &x2_start[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2_start[span*(i+1) + 8], _MM_HINT_T0);
+
+          /* access pre-computed tip vector values via a lookup table */
+          const double *x1 = &(aTipVec[16 * tipX1[i]]);
+          /* access the other(inner) node at the other end of the branch */
+          const double *x2 = &(x2_start[span * i]);
+
+          double term = 0.;
+
+          #pragma ivdep
+          #pragma vector aligned
+          for(int j = 0; j < span; j++)
+              term += x1[j] * x2[j] * diagptable[j];
+
+          if(!fastScaling)
+              term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+          else
+              term = log(0.25 * fabs(term));
+
+          sum += wgt[i] * term;
+        }
+    }
+    else
+    {
+        for (int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char*) &x1_start[span*(i+8)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x1_start[span*(i+8) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2_start[span*(i+8)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2_start[span*(i+8) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char*) &x1_start[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x1_start[span*(i+1) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2_start[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2_start[span*(i+1) + 8], _MM_HINT_T0);
+
+          const double *x1 = &(x1_start[span * i]);
+          const double *x2 = &(x2_start[span * i]);
+
+          double term = 0.;
+
+          #pragma ivdep
+          #pragma vector aligned
+          for(int j = 0; j < span; j++)
+              term += x1[j] * x2[j] * diagptable[j];
+
+          if(!fastScaling)
+              term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+          else
+              term = log(0.25 * fabs(term));
+
+          sum += wgt[i] * term;
+        }
+    }
+
+    return sum;
+}
+
+void sumGTRGAMMA_MIC(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+	double aTipVec[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    for(int k = 0; k < 16; k++)
+    {
+        for(int l = 0; l < 4; l++)
+        {
+            aTipVec[k*16 + l] = aTipVec[k*16 + 4 + l] = aTipVec[k*16 + 8 + l] = aTipVec[k*16 + 12 + l] = tipVector[k*4 + l];
+        }
+    }
+
+    switch(tipCase)
+    {
+      case PLL_TIP_TIP:
+      {
+        for(int i = 0; i < n; i++)
+        {
+            const double *left  = &(aTipVec[16 * tipX1[i]]);
+            const double *right = &(aTipVec[16 * tipX2[i]]);
+            double* sum = &sumtable[i * span];
+
+            #pragma ivdep
+            #pragma vector aligned nontemporal
+            for(int l = 0; l < span; l++)
+            {
+              sum[l] = left[l] * right[l];
+            }
+        }
+      } break;
+      case PLL_TIP_INNER:
+      {
+        for(int i = 0; i < n; i++)
+        {
+          _mm_prefetch((const char*) &x2_start[span*(i+32)], _MM_HINT_T1);
+          _mm_prefetch((const char*) &x2_start[span*(i+32) + 8], _MM_HINT_T1);
+
+          _mm_prefetch((const char*) &x2_start[span*(i+4)], _MM_HINT_T0);
+          _mm_prefetch((const char*) &x2_start[span*(i+4) + 8], _MM_HINT_T0);
+
+          const double *left = &(aTipVec[16 * tipX1[i]]);
+          const double *right = &(x2_start[span * i]);
+          double* sum = &sumtable[i * span];
+
+          #pragma ivdep
+          #pragma vector aligned nontemporal
+          for(int l = 0; l < span; l++)
+          {
+              sum[l] = left[l] * right[l];
+          }
+        }
+      } break;
+      case PLL_INNER_INNER:
+      {
+        for(int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char*) &x1_start[span*(i+32)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x1_start[span*(i+32) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2_start[span*(i+32)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2_start[span*(i+32) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char*) &x1_start[span*(i+4)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x1_start[span*(i+4) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2_start[span*(i+4)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2_start[span*(i+4) + 8], _MM_HINT_T0);
+
+            const double *left  = &(x1_start[span * i]);
+            const double *right = &(x2_start[span * i]);
+            double* sum = &sumtable[i * span];
+
+            #pragma ivdep
+            #pragma vector aligned nontemporal
+            for(int l = 0; l < span; l++)
+            {
+                sum[l] = left[l] * right[l];
+            }
+        }
+      } break;
+  //    default:
+  //      assert(0);
+    }
+}
+
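+/* Compute the first and second derivatives of the log-likelihood with respect to
+   the branch length parameter lz, using the per-site products precomputed by
+   sumGTRGAMMA_MIC; the results are written to ext_dlnLdlz and ext_d2lnLdlz2. */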
+void coreGTRGAMMA_MIC(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN, double *gammaRates, double lz, int *wgt)
+{
+	double diagptable0[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable1[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable2[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable01[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable02[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+    /* pre-compute the derivatives of the P matrix for all discrete GAMMA rates */
+
+    for(int i = 0; i < 4; i++)
+    {
+        const double ki = gammaRates[i];
+        const double kisqr = ki * ki;
+
+        diagptable0[i*4] = 1.;
+        diagptable1[i*4] = 0.;
+        diagptable2[i*4] = 0.;
+
+        for(int l = 1; l < states; l++)
+        {
+          diagptable0[i * 4 + l]  = exp(EIGN[l] * ki * lz);
+          diagptable1[i * 4 + l] = EIGN[l] * ki;
+          diagptable2[i * 4 + l] = EIGN[l] * EIGN[l] * kisqr;
+        }
+    }
+
+    #pragma ivdep
+    for(int i = 0; i < 16; i++)
+    {
+        diagptable01[i] = diagptable0[i] * diagptable1[i];
+        diagptable02[i] = diagptable0[i] * diagptable2[i];
+    }
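+    /* diagptable0 holds exp(EIGN * ki * lz) per category and state, diagptable1
+       and diagptable2 the corresponding first- and second-derivative factors
+       (EIGN * ki and (EIGN * ki)^2); pre-multiplying them into diagptable01 and
+       diagptable02 saves one multiplication per element in the site loop below. */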
+
+    /* loop over sites in this partition */
+
+    const int aligned_width = upper % 8 == 0 ? upper / 8 : upper / 8 + 1;
+
+    double dlnLBuf[8] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double d2lnLBuf[8] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    for (int j = 0; j < 8; ++j)
+    {
+        dlnLBuf[j] = 0.;
+        d2lnLBuf[j] = 0.;
+    }
+
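+    /* sites are processed in blocks of 8: the unrolled inner loop reduces each
+       site to scalar likelihood and derivative sums, one per vector lane, and the
+       weighted per-lane contributions are accumulated in dlnLBuf/d2lnLBuf and
+       summed after the site loop. */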
+    for (int i = 0; i < aligned_width; i++)
+    {
+        _mm_prefetch((const char*) &sumtable[i * span * 8], _MM_HINT_T0);
+        _mm_prefetch((const char*) &sumtable[i * span * 8 + 8], _MM_HINT_T0);
+
+        /* access the array with pre-computed values */
+        const double *sum = &sumtable[i * span * 8];
+
+        /* initial per-site likelihood and 1st and 2nd derivatives */
+
+        double invBuf[8] __attribute__((align(64)));
+        double d1Buf[8] __attribute__((align(64)));
+        double d2Buf[8] __attribute__((align(64)));
+
+        __m512d invVec;
+        __m512d d1Vec;
+        __m512d d2Vec;
+        int mask = 0x01;
+
+        #pragma noprefetch sum
+        #pragma unroll(8)
+        for(int j = 0; j < 8; j++)
+        {
+            _mm_prefetch((const char*) &sum[span*(j+8)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &sum[span*(j+8) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char*) &sum[span*(j+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &sum[span*(j+1) + 8], _MM_HINT_T0);
+
+            __m512d d0_1 = _mm512_load_pd(&diagptable0[0]);
+            __m512d d0_2 = _mm512_load_pd(&diagptable0[8]);
+
+            __m512d d01_1 = _mm512_load_pd(&diagptable01[0]);
+            __m512d d01_2 = _mm512_load_pd(&diagptable01[8]);
+
+            __m512d d02_1 = _mm512_load_pd(&diagptable02[0]);
+            __m512d d02_2 = _mm512_load_pd(&diagptable02[8]);
+
+            __m512d s_1 = _mm512_load_pd(&sum[j*16]);
+            __m512d s_2 = _mm512_load_pd(&sum[j*16 + 8]);
+            __m512d inv_1 = _mm512_mul_pd(d0_1, s_1);
+            __m512d d1_1 = _mm512_mul_pd(d01_1, s_1);
+            __m512d d2_1 = _mm512_mul_pd(d02_1, s_1);
+
+            __m512d inv_2 = _mm512_fmadd_pd(d0_2, s_2, inv_1);
+            __m512d d1_2 = _mm512_fmadd_pd(d01_2, s_2, d1_1);
+            __m512d d2_2 = _mm512_fmadd_pd(d02_2, s_2, d2_1);
+
+            __mmask8 k1 = _mm512_int2mask(mask);
+            mask <<= 1;
+
+            // horizontal reduction: the swizzle/permute adds sum the 8 doubles of
+            // each 512-bit register into every lane; the masked move then deposits
+            // site j's scalar into lane j of the block-wide accumulator
+            inv_2 = _mm512_add_pd (inv_2, _mm512_swizzle_pd(inv_2, _MM_SWIZ_REG_CDAB));
+            inv_2 = _mm512_add_pd (inv_2, _mm512_swizzle_pd(inv_2, _MM_SWIZ_REG_BADC));
+            inv_2 = _mm512_add_pd (inv_2, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(inv_2), _MM_PERM_BADC)));
+            invVec = _mm512_mask_mov_pd(invVec, k1, inv_2);
+
+            d1_2 = _mm512_add_pd (d1_2, _mm512_swizzle_pd(d1_2, _MM_SWIZ_REG_CDAB));
+            d1_2 = _mm512_add_pd (d1_2, _mm512_swizzle_pd(d1_2, _MM_SWIZ_REG_BADC));
+            d1_2 = _mm512_add_pd (d1_2, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(d1_2), _MM_PERM_BADC)));
+            d1Vec = _mm512_mask_mov_pd(d1Vec, k1, d1_2);
+
+            d2_2 = _mm512_add_pd (d2_2, _mm512_swizzle_pd(d2_2, _MM_SWIZ_REG_CDAB));
+            d2_2 = _mm512_add_pd (d2_2, _mm512_swizzle_pd(d2_2, _MM_SWIZ_REG_BADC));
+            d2_2 = _mm512_add_pd (d2_2, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(d2_2), _MM_PERM_BADC)));
+            d2Vec = _mm512_mask_mov_pd(d2Vec, k1, d2_2);
+        }
+
+        _mm512_store_pd(&invBuf[0], invVec);
+        _mm512_store_pd(&d1Buf[0], d1Vec);
+        _mm512_store_pd(&d2Buf[0], d2Vec);
+
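+        /* per-site derivatives of the log-likelihood: with L = invBuf[j],
+           L' = d1Buf[j] and L'' = d2Buf[j], dlnL/dlz = L'/L and
+           d2lnL/dlz^2 = L''/L - (L'/L)^2, each weighted by the site's pattern
+           count wgt[]. */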
+        #pragma ivdep
+        #pragma vector aligned
+        for (int j = 0; j < 8; ++j)
+        {
+            const double inv_Li = 1.0 / invBuf[j];
+
+            const double d1 = d1Buf[j] * inv_Li;
+            const double d2 = d2Buf[j] * inv_Li;
+
+            dlnLBuf[j] += wgt[i * 8 + j] * d1;
+            d2lnLBuf[j] += wgt[i * 8 + j] * (d2 - d1 * d1);
+        }
+    } // site loop
+
+    double dlnLdlz = 0.;
+    double d2lnLdlz2 = 0.;
+    for (int j = 0; j < 8; ++j)
+    {
+        dlnLdlz += dlnLBuf[j];
+        d2lnLdlz2 += d2lnLBuf[j];
+    }
+
+    *ext_dlnLdlz   = dlnLdlz;
+    *ext_d2lnLdlz2 = d2lnLdlz2;
+}
diff --git a/pll/models.c b/pll/models.c
new file mode 100644
index 0000000..7bc24ef
--- /dev/null
+++ b/pll/models.c
@@ -0,0 +1,4377 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file models.c
+ *  
+ * @brief Model related code
+ *
+ * Detailed description to appear soon.
+ */ 
+
+
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h> 
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+
+extern const unsigned int bitVectorSecondary[256];
+extern const unsigned int bitVector32[33];
+extern const unsigned int bitVectorAA[23];
+extern const unsigned int bitVectorIdentity[256];
+
+extern const partitionLengths pLengths[PLL_MAX_MODEL];
+
+extern FILE *byteFile;
+
+/** @brief Hardcoded values for the WAG model
+  
+    Fill the \a ext_initialRates array with hardcoded substitution rates
+    of the WAG model.
+   
+    @param ext_initialRates
+      Where to place the substitution rates
+*/
+void putWAG(double *ext_initialRates)
+{ 
+  double
+    scaler,
+    q[20][20],
+    daa[400];
+
+  int 
+    i,
+    j,
+    r;
+
+  /* fill the triangle below the diagonal with values */
+  daa[ 1*20+ 0] =  55.15710; daa[ 2*20+ 0] =  50.98480; daa[ 2*20+ 1] =  63.53460; 
+  daa[ 3*20+ 0] =  73.89980; daa[ 3*20+ 1] =  14.73040; daa[ 3*20+ 2] = 542.94200; 
+  daa[ 4*20+ 0] = 102.70400; daa[ 4*20+ 1] =  52.81910; daa[ 4*20+ 2] =  26.52560; 
+  daa[ 4*20+ 3] =   3.02949; daa[ 5*20+ 0] =  90.85980; daa[ 5*20+ 1] = 303.55000; 
+  daa[ 5*20+ 2] = 154.36400; daa[ 5*20+ 3] =  61.67830; daa[ 5*20+ 4] =   9.88179; 
+  daa[ 6*20+ 0] = 158.28500; daa[ 6*20+ 1] =  43.91570; daa[ 6*20+ 2] =  94.71980; 
+  daa[ 6*20+ 3] = 617.41600; daa[ 6*20+ 4] =   2.13520; daa[ 6*20+ 5] = 546.94700; 
+  daa[ 7*20+ 0] = 141.67200; daa[ 7*20+ 1] =  58.46650; daa[ 7*20+ 2] = 112.55600; 
+  daa[ 7*20+ 3] =  86.55840; daa[ 7*20+ 4] =  30.66740; daa[ 7*20+ 5] =  33.00520; 
+  daa[ 7*20+ 6] =  56.77170; daa[ 8*20+ 0] =  31.69540; daa[ 8*20+ 1] = 213.71500; 
+  daa[ 8*20+ 2] = 395.62900; daa[ 8*20+ 3] =  93.06760; daa[ 8*20+ 4] =  24.89720; 
+  daa[ 8*20+ 5] = 429.41100; daa[ 8*20+ 6] =  57.00250; daa[ 8*20+ 7] =  24.94100; 
+  daa[ 9*20+ 0] =  19.33350; daa[ 9*20+ 1] =  18.69790; daa[ 9*20+ 2] =  55.42360; 
+  daa[ 9*20+ 3] =   3.94370; daa[ 9*20+ 4] =  17.01350; daa[ 9*20+ 5] =  11.39170; 
+  daa[ 9*20+ 6] =  12.73950; daa[ 9*20+ 7] =   3.04501; daa[ 9*20+ 8] =  13.81900; 
+  daa[10*20+ 0] =  39.79150; daa[10*20+ 1] =  49.76710; daa[10*20+ 2] =  13.15280; 
+  daa[10*20+ 3] =   8.48047; daa[10*20+ 4] =  38.42870; daa[10*20+ 5] =  86.94890; 
+  daa[10*20+ 6] =  15.42630; daa[10*20+ 7] =   6.13037; daa[10*20+ 8] =  49.94620; 
+  daa[10*20+ 9] = 317.09700; daa[11*20+ 0] =  90.62650; daa[11*20+ 1] = 535.14200; 
+  daa[11*20+ 2] = 301.20100; daa[11*20+ 3] =  47.98550; daa[11*20+ 4] =   7.40339; 
+  daa[11*20+ 5] = 389.49000; daa[11*20+ 6] = 258.44300; daa[11*20+ 7] =  37.35580; 
+  daa[11*20+ 8] =  89.04320; daa[11*20+ 9] =  32.38320; daa[11*20+10] =  25.75550; 
+  daa[12*20+ 0] =  89.34960; daa[12*20+ 1] =  68.31620; daa[12*20+ 2] =  19.82210; 
+  daa[12*20+ 3] =  10.37540; daa[12*20+ 4] =  39.04820; daa[12*20+ 5] = 154.52600; 
+  daa[12*20+ 6] =  31.51240; daa[12*20+ 7] =  17.41000; daa[12*20+ 8] =  40.41410; 
+  daa[12*20+ 9] = 425.74600; daa[12*20+10] = 485.40200; daa[12*20+11] =  93.42760; 
+  daa[13*20+ 0] =  21.04940; daa[13*20+ 1] =  10.27110; daa[13*20+ 2] =   9.61621; 
+  daa[13*20+ 3] =   4.67304; daa[13*20+ 4] =  39.80200; daa[13*20+ 5] =   9.99208; 
+  daa[13*20+ 6] =   8.11339; daa[13*20+ 7] =   4.99310; daa[13*20+ 8] =  67.93710; 
+  daa[13*20+ 9] = 105.94700; daa[13*20+10] = 211.51700; daa[13*20+11] =   8.88360; 
+  daa[13*20+12] = 119.06300; daa[14*20+ 0] = 143.85500; daa[14*20+ 1] =  67.94890; 
+  daa[14*20+ 2] =  19.50810; daa[14*20+ 3] =  42.39840; daa[14*20+ 4] =  10.94040; 
+  daa[14*20+ 5] =  93.33720; daa[14*20+ 6] =  68.23550; daa[14*20+ 7] =  24.35700; 
+  daa[14*20+ 8] =  69.61980; daa[14*20+ 9] =   9.99288; daa[14*20+10] =  41.58440; 
+  daa[14*20+11] =  55.68960; daa[14*20+12] =  17.13290; daa[14*20+13] =  16.14440; 
+  daa[15*20+ 0] = 337.07900; daa[15*20+ 1] = 122.41900; daa[15*20+ 2] = 397.42300; 
+  daa[15*20+ 3] = 107.17600; daa[15*20+ 4] = 140.76600; daa[15*20+ 5] = 102.88700; 
+  daa[15*20+ 6] =  70.49390; daa[15*20+ 7] = 134.18200; daa[15*20+ 8] =  74.01690; 
+  daa[15*20+ 9] =  31.94400; daa[15*20+10] =  34.47390; daa[15*20+11] =  96.71300; 
+  daa[15*20+12] =  49.39050; daa[15*20+13] =  54.59310; daa[15*20+14] = 161.32800; 
+  daa[16*20+ 0] = 212.11100; daa[16*20+ 1] =  55.44130; daa[16*20+ 2] = 203.00600; 
+  daa[16*20+ 3] =  37.48660; daa[16*20+ 4] =  51.29840; daa[16*20+ 5] =  85.79280; 
+  daa[16*20+ 6] =  82.27650; daa[16*20+ 7] =  22.58330; daa[16*20+ 8] =  47.33070; 
+  daa[16*20+ 9] = 145.81600; daa[16*20+10] =  32.66220; daa[16*20+11] = 138.69800; 
+  daa[16*20+12] = 151.61200; daa[16*20+13] =  17.19030; daa[16*20+14] =  79.53840; 
+  daa[16*20+15] = 437.80200; daa[17*20+ 0] =  11.31330; daa[17*20+ 1] = 116.39200; 
+  daa[17*20+ 2] =   7.19167; daa[17*20+ 3] =  12.97670; daa[17*20+ 4] =  71.70700; 
+  daa[17*20+ 5] =  21.57370; daa[17*20+ 6] =  15.65570; daa[17*20+ 7] =  33.69830; 
+  daa[17*20+ 8] =  26.25690; daa[17*20+ 9] =  21.24830; daa[17*20+10] =  66.53090; 
+  daa[17*20+11] =  13.75050; daa[17*20+12] =  51.57060; daa[17*20+13] = 152.96400; 
+  daa[17*20+14] =  13.94050; daa[17*20+15] =  52.37420; daa[17*20+16] =  11.08640; 
+  daa[18*20+ 0] =  24.07350; daa[18*20+ 1] =  38.15330; daa[18*20+ 2] = 108.60000; 
+  daa[18*20+ 3] =  32.57110; daa[18*20+ 4] =  54.38330; daa[18*20+ 5] =  22.77100; 
+  daa[18*20+ 6] =  19.63030; daa[18*20+ 7] =  10.36040; daa[18*20+ 8] = 387.34400; 
+  daa[18*20+ 9] =  42.01700; daa[18*20+10] =  39.86180; daa[18*20+11] =  13.32640; 
+  daa[18*20+12] =  42.84370; daa[18*20+13] = 645.42800; daa[18*20+14] =  21.60460; 
+  daa[18*20+15] =  78.69930; daa[18*20+16] =  29.11480; daa[18*20+17] = 248.53900; 
+  daa[19*20+ 0] = 200.60100; daa[19*20+ 1] =  25.18490; daa[19*20+ 2] =  19.62460; 
+  daa[19*20+ 3] =  15.23350; daa[19*20+ 4] = 100.21400; daa[19*20+ 5] =  30.12810; 
+  daa[19*20+ 6] =  58.87310; daa[19*20+ 7] =  18.72470; daa[19*20+ 8] =  11.83580; 
+  daa[19*20+ 9] = 782.13000; daa[19*20+10] = 180.03400; daa[19*20+11] =  30.54340; 
+  daa[19*20+12] = 205.84500; daa[19*20+13] =  64.98920; daa[19*20+14] =  31.48870; 
+  daa[19*20+15] =  23.27390; daa[19*20+16] = 138.82300; daa[19*20+17] =  36.53690; 
+  daa[19*20+18] =  31.47300; 
+
+  /* initialize a 20x20 matrix */
+  for(i = 0; i < 20; i++)
+    for(j = 0; j < 20; j++)
+      q[i][j] = 0.0;
+
+  /* fill the triangle above the diagonal with the corresponding values from the
+     lower triangle */
+  for (i=0; i<20; i++)  
+    for (j=0; j<i; j++)               
+      daa[j*20+i] = daa[i*20+j];
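+  /* the exchangeability rates of a time-reversible model are symmetric, so the
+     lower triangle filled in above determines the whole matrix */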
+
+  /* copy the triangle above the diagonal from daa (which is a linear block) to
+     the triangle above the diagonal of a square matrix q */
+  for(i = 0; i < 19; i++)
+    for(j = i + 1; j < 20; j++)      
+      q[i][j] = daa[i * 20 + j];
+
+  
+  /*
+    for (i=0; i<20; i++) 
+    {
+      for (j=0; j<20; j++)
+        printf("%1.2f ", q[i][j]);
+      printf("\n");
+    }
+    printf("\n");
+
+    printf("%f\n", q[18][19]);
+  */
+
+  /* create a scaler from the last entry of the upper triangle of q (q[18][19]);
+     after scaling, that reference rate becomes 1.0 */
+  scaler = 1.0 / q[18][19];
+
+  
+
+  /* scale all values of the matrix */
+  for(i = 0; i < 19; i++)
+    for(j = i + 1; j < 20; j++)      
+      q[i][j] *= scaler;
+
+  /* copy the upper triangle of q to the linear array ext_initialRates */
+  for(i = 0, r = 0; i < 19; i++)          
+    for(j = i + 1; j < 20; j++)      
+      ext_initialRates[r++] = q[i][j];           
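+  /* ext_initialRates now holds the 20*19/2 = 190 upper-triangle rates in row-major
+     order: (0,1), (0,2), ..., (0,19), (1,2), ..., (18,19); after the scaling above
+     the last entry equals 1.0 */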
+      
+  /*
+    for (i=0; i<20; i++) 
+    {
+      for (j=0; j<20; j++)
+        printf("%1.2f ", q[i][j]);
+      printf("\n");
+    }
+    printf("\n");
+  */
+
+}
+
+
+
+/** @brief Initialize a protein substitution rates matrix
+  *
+  * Initialize the array pointed to by \a ext_initialRates with the substitution
+  * rates of the corresponding protein model and set \a f to the appropriate
+  * stationary frequencies
+  *
+  * @param f
+  *   Array where to store the stationary frequencies
+  *
+  * @param proteinMatrix
+  *   Which protein matrix to use
+  *
+  * @param ext_initialRates
+  *   Where to store the retrieved substitution rates
+  *
+  * @param lg4_index
+  *   In case we are filling the substitution rates matrix of an LG4 model, the index
+  *   specifies which of the four matrices to use
+  *
+*/
+static void initProtMat(double f[20], int proteinMatrix, double *ext_initialRates, int lg4_index)
+{ 
+  double q[20][20];
+  double daa[400], max, temp;
+  int i, j, r;
+  double *initialRates = ext_initialRates;
+  double scaler;
+
+  {
+      switch(proteinMatrix)
+        {
+        case PLL_DAYHOFF:
+          {     
+            daa[ 1*20+ 0] =   27.00; daa[ 2*20+ 0] =   98.00; daa[ 2*20+ 1] =   32.00; daa[ 3*20+ 0] =  120.00;
+            daa[ 3*20+ 1] =    0.00; daa[ 3*20+ 2] =  905.00; daa[ 4*20+ 0] =   36.00; daa[ 4*20+ 1] =   23.00;
+            daa[ 4*20+ 2] =    0.00; daa[ 4*20+ 3] =    0.00; daa[ 5*20+ 0] =   89.00; daa[ 5*20+ 1] =  246.00;
+            daa[ 5*20+ 2] =  103.00; daa[ 5*20+ 3] =  134.00; daa[ 5*20+ 4] =    0.00; daa[ 6*20+ 0] =  198.00;
+            daa[ 6*20+ 1] =    1.00; daa[ 6*20+ 2] =  148.00; daa[ 6*20+ 3] = 1153.00; daa[ 6*20+ 4] =    0.00;
+            daa[ 6*20+ 5] =  716.00; daa[ 7*20+ 0] =  240.00; daa[ 7*20+ 1] =    9.00; daa[ 7*20+ 2] =  139.00;
+            daa[ 7*20+ 3] =  125.00; daa[ 7*20+ 4] =   11.00; daa[ 7*20+ 5] =   28.00; daa[ 7*20+ 6] =   81.00;
+            daa[ 8*20+ 0] =   23.00; daa[ 8*20+ 1] =  240.00; daa[ 8*20+ 2] =  535.00; daa[ 8*20+ 3] =   86.00;
+            daa[ 8*20+ 4] =   28.00; daa[ 8*20+ 5] =  606.00; daa[ 8*20+ 6] =   43.00; daa[ 8*20+ 7] =   10.00;
+            daa[ 9*20+ 0] =   65.00; daa[ 9*20+ 1] =   64.00; daa[ 9*20+ 2] =   77.00; daa[ 9*20+ 3] =   24.00;
+            daa[ 9*20+ 4] =   44.00; daa[ 9*20+ 5] =   18.00; daa[ 9*20+ 6] =   61.00; daa[ 9*20+ 7] =    0.00;
+            daa[ 9*20+ 8] =    7.00; daa[10*20+ 0] =   41.00; daa[10*20+ 1] =   15.00; daa[10*20+ 2] =   34.00;
+            daa[10*20+ 3] =    0.00; daa[10*20+ 4] =    0.00; daa[10*20+ 5] =   73.00; daa[10*20+ 6] =   11.00;
+            daa[10*20+ 7] =    7.00; daa[10*20+ 8] =   44.00; daa[10*20+ 9] =  257.00; daa[11*20+ 0] =   26.00;
+            daa[11*20+ 1] =  464.00; daa[11*20+ 2] =  318.00; daa[11*20+ 3] =   71.00; daa[11*20+ 4] =    0.00;
+            daa[11*20+ 5] =  153.00; daa[11*20+ 6] =   83.00; daa[11*20+ 7] =   27.00; daa[11*20+ 8] =   26.00;
+            daa[11*20+ 9] =   46.00; daa[11*20+10] =   18.00; daa[12*20+ 0] =   72.00; daa[12*20+ 1] =   90.00;
+            daa[12*20+ 2] =    1.00; daa[12*20+ 3] =    0.00; daa[12*20+ 4] =    0.00; daa[12*20+ 5] =  114.00;
+            daa[12*20+ 6] =   30.00; daa[12*20+ 7] =   17.00; daa[12*20+ 8] =    0.00; daa[12*20+ 9] =  336.00;
+            daa[12*20+10] =  527.00; daa[12*20+11] =  243.00; daa[13*20+ 0] =   18.00; daa[13*20+ 1] =   14.00;
+            daa[13*20+ 2] =   14.00; daa[13*20+ 3] =    0.00; daa[13*20+ 4] =    0.00; daa[13*20+ 5] =    0.00;
+            daa[13*20+ 6] =    0.00; daa[13*20+ 7] =   15.00; daa[13*20+ 8] =   48.00; daa[13*20+ 9] =  196.00;
+            daa[13*20+10] =  157.00; daa[13*20+11] =    0.00; daa[13*20+12] =   92.00; daa[14*20+ 0] =  250.00;
+            daa[14*20+ 1] =  103.00; daa[14*20+ 2] =   42.00; daa[14*20+ 3] =   13.00; daa[14*20+ 4] =   19.00;
+            daa[14*20+ 5] =  153.00; daa[14*20+ 6] =   51.00; daa[14*20+ 7] =   34.00; daa[14*20+ 8] =   94.00;
+            daa[14*20+ 9] =   12.00; daa[14*20+10] =   32.00; daa[14*20+11] =   33.00; daa[14*20+12] =   17.00;
+            daa[14*20+13] =   11.00; daa[15*20+ 0] =  409.00; daa[15*20+ 1] =  154.00; daa[15*20+ 2] =  495.00;
+            daa[15*20+ 3] =   95.00; daa[15*20+ 4] =  161.00; daa[15*20+ 5] =   56.00; daa[15*20+ 6] =   79.00;
+            daa[15*20+ 7] =  234.00; daa[15*20+ 8] =   35.00; daa[15*20+ 9] =   24.00; daa[15*20+10] =   17.00;
+            daa[15*20+11] =   96.00; daa[15*20+12] =   62.00; daa[15*20+13] =   46.00; daa[15*20+14] =  245.00;
+            daa[16*20+ 0] =  371.00; daa[16*20+ 1] =   26.00; daa[16*20+ 2] =  229.00; daa[16*20+ 3] =   66.00;
+            daa[16*20+ 4] =   16.00; daa[16*20+ 5] =   53.00; daa[16*20+ 6] =   34.00; daa[16*20+ 7] =   30.00;
+            daa[16*20+ 8] =   22.00; daa[16*20+ 9] =  192.00; daa[16*20+10] =   33.00; daa[16*20+11] =  136.00;
+            daa[16*20+12] =  104.00; daa[16*20+13] =   13.00; daa[16*20+14] =   78.00; daa[16*20+15] =  550.00;
+            daa[17*20+ 0] =    0.00; daa[17*20+ 1] =  201.00; daa[17*20+ 2] =   23.00; daa[17*20+ 3] =    0.00;
+            daa[17*20+ 4] =    0.00; daa[17*20+ 5] =    0.00; daa[17*20+ 6] =    0.00; daa[17*20+ 7] =    0.00;
+            daa[17*20+ 8] =   27.00; daa[17*20+ 9] =    0.00; daa[17*20+10] =   46.00; daa[17*20+11] =    0.00;
+            daa[17*20+12] =    0.00; daa[17*20+13] =   76.00; daa[17*20+14] =    0.00; daa[17*20+15] =   75.00;
+            daa[17*20+16] =    0.00; daa[18*20+ 0] =   24.00; daa[18*20+ 1] =    8.00; daa[18*20+ 2] =   95.00;
+            daa[18*20+ 3] =    0.00; daa[18*20+ 4] =   96.00; daa[18*20+ 5] =    0.00; daa[18*20+ 6] =   22.00;
+            daa[18*20+ 7] =    0.00; daa[18*20+ 8] =  127.00; daa[18*20+ 9] =   37.00; daa[18*20+10] =   28.00;
+            daa[18*20+11] =   13.00; daa[18*20+12] =    0.00; daa[18*20+13] =  698.00; daa[18*20+14] =    0.00;
+            daa[18*20+15] =   34.00; daa[18*20+16] =   42.00; daa[18*20+17] =   61.00; daa[19*20+ 0] =  208.00;
+            daa[19*20+ 1] =   24.00; daa[19*20+ 2] =   15.00; daa[19*20+ 3] =   18.00; daa[19*20+ 4] =   49.00;
+            daa[19*20+ 5] =   35.00; daa[19*20+ 6] =   37.00; daa[19*20+ 7] =   54.00; daa[19*20+ 8] =   44.00;
+            daa[19*20+ 9] =  889.00; daa[19*20+10] =  175.00; daa[19*20+11] =   10.00; daa[19*20+12] =  258.00;
+            daa[19*20+13] =   12.00; daa[19*20+14] =   48.00; daa[19*20+15] =   30.00; daa[19*20+16] =  157.00;
+            daa[19*20+17] =    0.00; daa[19*20+18] =   28.00;               
+
+	    f[ 0] = 0.087127; f[ 1] = 0.040904; f[ 2] = 0.040432; f[ 3] = 0.046872;
+	    f[ 4] = 0.033474; f[ 5] = 0.038255; f[ 6] = 0.049530; f[ 7] = 0.088612;
+	    f[ 8] = 0.033618; f[ 9] = 0.036886; f[10] = 0.085357; f[11] = 0.080482;
+	    f[12] = 0.014753; f[13] = 0.039772; f[14] = 0.050680; f[15] = 0.069577;
+	    f[16] = 0.058542; f[17] = 0.010494; f[18] = 0.029916; f[19] = 0.064717;
+          }
+          break;
+        case PLL_DCMUT:
+          {     
+            daa[ 1*20+ 0] =   26.78280; daa[ 2*20+ 0] =   98.44740; daa[ 2*20+ 1] =   32.70590; daa[ 3*20+ 0] =  119.98050; 
+            daa[ 3*20+ 1] =    0.00000; daa[ 3*20+ 2] =  893.15150; daa[ 4*20+ 0] =   36.00160; daa[ 4*20+ 1] =   23.23740; 
+            daa[ 4*20+ 2] =    0.00000; daa[ 4*20+ 3] =    0.00000; daa[ 5*20+ 0] =   88.77530; daa[ 5*20+ 1] =  243.99390; 
+            daa[ 5*20+ 2] =  102.85090; daa[ 5*20+ 3] =  134.85510; daa[ 5*20+ 4] =    0.00000; daa[ 6*20+ 0] =  196.11670; 
+            daa[ 6*20+ 1] =    0.00000; daa[ 6*20+ 2] =  149.34090; daa[ 6*20+ 3] = 1138.86590; daa[ 6*20+ 4] =    0.00000; 
+            daa[ 6*20+ 5] =  708.60220; daa[ 7*20+ 0] =  238.61110; daa[ 7*20+ 1] =    8.77910; daa[ 7*20+ 2] =  138.53520; 
+            daa[ 7*20+ 3] =  124.09810; daa[ 7*20+ 4] =   10.72780; daa[ 7*20+ 5] =   28.15810; daa[ 7*20+ 6] =   81.19070; 
+            daa[ 8*20+ 0] =   22.81160; daa[ 8*20+ 1] =  238.31480; daa[ 8*20+ 2] =  529.00240; daa[ 8*20+ 3] =   86.82410; 
+            daa[ 8*20+ 4] =   28.27290; daa[ 8*20+ 5] =  601.16130; daa[ 8*20+ 6] =   43.94690; daa[ 8*20+ 7] =   10.68020; 
+            daa[ 9*20+ 0] =   65.34160; daa[ 9*20+ 1] =   63.26290; daa[ 9*20+ 2] =   76.80240; daa[ 9*20+ 3] =   23.92480; 
+            daa[ 9*20+ 4] =   43.80740; daa[ 9*20+ 5] =   18.03930; daa[ 9*20+ 6] =   60.95260; daa[ 9*20+ 7] =    0.00000; 
+            daa[ 9*20+ 8] =    7.69810; daa[10*20+ 0] =   40.64310; daa[10*20+ 1] =   15.49240; daa[10*20+ 2] =   34.11130; 
+            daa[10*20+ 3] =    0.00000; daa[10*20+ 4] =    0.00000; daa[10*20+ 5] =   73.07720; daa[10*20+ 6] =   11.28800; 
+            daa[10*20+ 7] =    7.15140; daa[10*20+ 8] =   44.35040; daa[10*20+ 9] =  255.66850; daa[11*20+ 0] =   25.86350; 
+            daa[11*20+ 1] =  461.01240; daa[11*20+ 2] =  314.83710; daa[11*20+ 3] =   71.69130; daa[11*20+ 4] =    0.00000; 
+            daa[11*20+ 5] =  151.90780; daa[11*20+ 6] =   83.00780; daa[11*20+ 7] =   26.76830; daa[11*20+ 8] =   27.04750; 
+            daa[11*20+ 9] =   46.08570; daa[11*20+10] =   18.06290; daa[12*20+ 0] =   71.78400; daa[12*20+ 1] =   89.63210; 
+            daa[12*20+ 2] =    0.00000; daa[12*20+ 3] =    0.00000; daa[12*20+ 4] =    0.00000; daa[12*20+ 5] =  112.74990; 
+            daa[12*20+ 6] =   30.48030; daa[12*20+ 7] =   17.03720; daa[12*20+ 8] =    0.00000; daa[12*20+ 9] =  333.27320; 
+            daa[12*20+10] =  523.01150; daa[12*20+11] =  241.17390; daa[13*20+ 0] =   18.36410; daa[13*20+ 1] =   13.69060; 
+            daa[13*20+ 2] =   13.85030; daa[13*20+ 3] =    0.00000; daa[13*20+ 4] =    0.00000; daa[13*20+ 5] =    0.00000; 
+            daa[13*20+ 6] =    0.00000; daa[13*20+ 7] =   15.34780; daa[13*20+ 8] =   47.59270; daa[13*20+ 9] =  195.19510; 
+            daa[13*20+10] =  156.51600; daa[13*20+11] =    0.00000; daa[13*20+12] =   92.18600; daa[14*20+ 0] =  248.59200; 
+            daa[14*20+ 1] =  102.83130; daa[14*20+ 2] =   41.92440; daa[14*20+ 3] =   13.39400; daa[14*20+ 4] =   18.75500; 
+            daa[14*20+ 5] =  152.61880; daa[14*20+ 6] =   50.70030; daa[14*20+ 7] =   34.71530; daa[14*20+ 8] =   93.37090; 
+            daa[14*20+ 9] =   11.91520; daa[14*20+10] =   31.62580; daa[14*20+11] =   33.54190; daa[14*20+12] =   17.02050; 
+            daa[14*20+13] =   11.05060; daa[15*20+ 0] =  405.18700; daa[15*20+ 1] =  153.15900; daa[15*20+ 2] =  488.58920; 
+            daa[15*20+ 3] =   95.60970; daa[15*20+ 4] =  159.83560; daa[15*20+ 5] =   56.18280; daa[15*20+ 6] =   79.39990; 
+            daa[15*20+ 7] =  232.22430; daa[15*20+ 8] =   35.36430; daa[15*20+ 9] =   24.79550; daa[15*20+10] =   17.14320; 
+            daa[15*20+11] =   95.45570; daa[15*20+12] =   61.99510; daa[15*20+13] =   45.99010; daa[15*20+14] =  242.72020; 
+            daa[16*20+ 0] =  368.03650; daa[16*20+ 1] =   26.57450; daa[16*20+ 2] =  227.16970; daa[16*20+ 3] =   66.09300; 
+            daa[16*20+ 4] =   16.23660; daa[16*20+ 5] =   52.56510; daa[16*20+ 6] =   34.01560; daa[16*20+ 7] =   30.66620; 
+            daa[16*20+ 8] =   22.63330; daa[16*20+ 9] =  190.07390; daa[16*20+10] =   33.10900; daa[16*20+11] =  135.05990; 
+            daa[16*20+12] =  103.15340; daa[16*20+13] =   13.66550; daa[16*20+14] =   78.28570; daa[16*20+15] =  543.66740; 
+            daa[17*20+ 0] =    0.00000; daa[17*20+ 1] =  200.13750; daa[17*20+ 2] =   22.49680; daa[17*20+ 3] =    0.00000; 
+            daa[17*20+ 4] =    0.00000; daa[17*20+ 5] =    0.00000; daa[17*20+ 6] =    0.00000; daa[17*20+ 7] =    0.00000; 
+            daa[17*20+ 8] =   27.05640; daa[17*20+ 9] =    0.00000; daa[17*20+10] =   46.17760; daa[17*20+11] =    0.00000; 
+            daa[17*20+12] =    0.00000; daa[17*20+13] =   76.23540; daa[17*20+14] =    0.00000; daa[17*20+15] =   74.08190; 
+            daa[17*20+16] =    0.00000; daa[18*20+ 0] =   24.41390; daa[18*20+ 1] =    7.80120; daa[18*20+ 2] =   94.69400; 
+            daa[18*20+ 3] =    0.00000; daa[18*20+ 4] =   95.31640; daa[18*20+ 5] =    0.00000; daa[18*20+ 6] =   21.47170; 
+            daa[18*20+ 7] =    0.00000; daa[18*20+ 8] =  126.54000; daa[18*20+ 9] =   37.48340; daa[18*20+10] =   28.65720; 
+            daa[18*20+11] =   13.21420; daa[18*20+12] =    0.00000; daa[18*20+13] =  695.26290; daa[18*20+14] =    0.00000; 
+            daa[18*20+15] =   33.62890; daa[18*20+16] =   41.78390; daa[18*20+17] =   60.80700; daa[19*20+ 0] =  205.95640; 
+            daa[19*20+ 1] =   24.03680; daa[19*20+ 2] =   15.80670; daa[19*20+ 3] =   17.83160; daa[19*20+ 4] =   48.46780; 
+            daa[19*20+ 5] =   34.69830; daa[19*20+ 6] =   36.72500; daa[19*20+ 7] =   53.81650; daa[19*20+ 8] =   43.87150; 
+            daa[19*20+ 9] =  881.00380; daa[19*20+10] =  174.51560; daa[19*20+11] =   10.38500; daa[19*20+12] =  256.59550; 
+            daa[19*20+13] =   12.36060; daa[19*20+14] =   48.50260; daa[19*20+15] =   30.38360; daa[19*20+16] =  156.19970; 
+            daa[19*20+17] =    0.00000; daa[19*20+18] =   27.93790;                
+
+	    f[ 0] = 0.087127; f[ 1] = 0.040904; f[ 2] = 0.040432; f[ 3] = 0.046872;
+	    f[ 4] = 0.033474; f[ 5] = 0.038255; f[ 6] = 0.049530; f[ 7] = 0.088612;
+	    f[ 8] = 0.033619; f[ 9] = 0.036886; f[10] = 0.085357; f[11] = 0.080481;
+	    f[12] = 0.014753; f[13] = 0.039772; f[14] = 0.050680; f[15] = 0.069577;
+	    f[16] = 0.058542; f[17] = 0.010494; f[18] = 0.029916; f[19] = 0.064717;
+          }
+          break;
+        case PLL_JTT:
+          {
+            daa[ 1*20+ 0] =   58.00; daa[ 2*20+ 0] =   54.00; daa[ 2*20+ 1] =   45.00; daa[ 3*20+ 0] =   81.00;
+            daa[ 3*20+ 1] =   16.00; daa[ 3*20+ 2] =  528.00; daa[ 4*20+ 0] =   56.00; daa[ 4*20+ 1] =  113.00;
+            daa[ 4*20+ 2] =   34.00; daa[ 4*20+ 3] =   10.00; daa[ 5*20+ 0] =   57.00; daa[ 5*20+ 1] =  310.00;
+            daa[ 5*20+ 2] =   86.00; daa[ 5*20+ 3] =   49.00; daa[ 5*20+ 4] =    9.00; daa[ 6*20+ 0] =  105.00;
+            daa[ 6*20+ 1] =   29.00; daa[ 6*20+ 2] =   58.00; daa[ 6*20+ 3] =  767.00; daa[ 6*20+ 4] =    5.00;
+            daa[ 6*20+ 5] =  323.00; daa[ 7*20+ 0] =  179.00; daa[ 7*20+ 1] =  137.00; daa[ 7*20+ 2] =   81.00;
+            daa[ 7*20+ 3] =  130.00; daa[ 7*20+ 4] =   59.00; daa[ 7*20+ 5] =   26.00; daa[ 7*20+ 6] =  119.00;
+            daa[ 8*20+ 0] =   27.00; daa[ 8*20+ 1] =  328.00; daa[ 8*20+ 2] =  391.00; daa[ 8*20+ 3] =  112.00;
+            daa[ 8*20+ 4] =   69.00; daa[ 8*20+ 5] =  597.00; daa[ 8*20+ 6] =   26.00; daa[ 8*20+ 7] =   23.00;
+            daa[ 9*20+ 0] =   36.00; daa[ 9*20+ 1] =   22.00; daa[ 9*20+ 2] =   47.00; daa[ 9*20+ 3] =   11.00;
+            daa[ 9*20+ 4] =   17.00; daa[ 9*20+ 5] =    9.00; daa[ 9*20+ 6] =   12.00; daa[ 9*20+ 7] =    6.00;
+            daa[ 9*20+ 8] =   16.00; daa[10*20+ 0] =   30.00; daa[10*20+ 1] =   38.00; daa[10*20+ 2] =   12.00;
+            daa[10*20+ 3] =    7.00; daa[10*20+ 4] =   23.00; daa[10*20+ 5] =   72.00; daa[10*20+ 6] =    9.00;
+            daa[10*20+ 7] =    6.00; daa[10*20+ 8] =   56.00; daa[10*20+ 9] =  229.00; daa[11*20+ 0] =   35.00;
+            daa[11*20+ 1] =  646.00; daa[11*20+ 2] =  263.00; daa[11*20+ 3] =   26.00; daa[11*20+ 4] =    7.00;
+            daa[11*20+ 5] =  292.00; daa[11*20+ 6] =  181.00; daa[11*20+ 7] =   27.00; daa[11*20+ 8] =   45.00;
+            daa[11*20+ 9] =   21.00; daa[11*20+10] =   14.00; daa[12*20+ 0] =   54.00; daa[12*20+ 1] =   44.00;
+            daa[12*20+ 2] =   30.00; daa[12*20+ 3] =   15.00; daa[12*20+ 4] =   31.00; daa[12*20+ 5] =   43.00;
+            daa[12*20+ 6] =   18.00; daa[12*20+ 7] =   14.00; daa[12*20+ 8] =   33.00; daa[12*20+ 9] =  479.00;
+            daa[12*20+10] =  388.00; daa[12*20+11] =   65.00; daa[13*20+ 0] =   15.00; daa[13*20+ 1] =    5.00;
+            daa[13*20+ 2] =   10.00; daa[13*20+ 3] =    4.00; daa[13*20+ 4] =   78.00; daa[13*20+ 5] =    4.00;
+            daa[13*20+ 6] =    5.00; daa[13*20+ 7] =    5.00; daa[13*20+ 8] =   40.00; daa[13*20+ 9] =   89.00;
+            daa[13*20+10] =  248.00; daa[13*20+11] =    4.00; daa[13*20+12] =   43.00; daa[14*20+ 0] =  194.00;
+            daa[14*20+ 1] =   74.00; daa[14*20+ 2] =   15.00; daa[14*20+ 3] =   15.00; daa[14*20+ 4] =   14.00;
+            daa[14*20+ 5] =  164.00; daa[14*20+ 6] =   18.00; daa[14*20+ 7] =   24.00; daa[14*20+ 8] =  115.00;
+            daa[14*20+ 9] =   10.00; daa[14*20+10] =  102.00; daa[14*20+11] =   21.00; daa[14*20+12] =   16.00;
+            daa[14*20+13] =   17.00; daa[15*20+ 0] =  378.00; daa[15*20+ 1] =  101.00; daa[15*20+ 2] =  503.00;
+            daa[15*20+ 3] =   59.00; daa[15*20+ 4] =  223.00; daa[15*20+ 5] =   53.00; daa[15*20+ 6] =   30.00;
+            daa[15*20+ 7] =  201.00; daa[15*20+ 8] =   73.00; daa[15*20+ 9] =   40.00; daa[15*20+10] =   59.00;
+            daa[15*20+11] =   47.00; daa[15*20+12] =   29.00; daa[15*20+13] =   92.00; daa[15*20+14] =  285.00;
+            daa[16*20+ 0] =  475.00; daa[16*20+ 1] =   64.00; daa[16*20+ 2] =  232.00; daa[16*20+ 3] =   38.00;
+            daa[16*20+ 4] =   42.00; daa[16*20+ 5] =   51.00; daa[16*20+ 6] =   32.00; daa[16*20+ 7] =   33.00;
+            daa[16*20+ 8] =   46.00; daa[16*20+ 9] =  245.00; daa[16*20+10] =   25.00; daa[16*20+11] =  103.00;
+            daa[16*20+12] =  226.00; daa[16*20+13] =   12.00; daa[16*20+14] =  118.00; daa[16*20+15] =  477.00;
+            daa[17*20+ 0] =    9.00; daa[17*20+ 1] =  126.00; daa[17*20+ 2] =    8.00; daa[17*20+ 3] =    4.00;
+            daa[17*20+ 4] =  115.00; daa[17*20+ 5] =   18.00; daa[17*20+ 6] =   10.00; daa[17*20+ 7] =   55.00;
+            daa[17*20+ 8] =    8.00; daa[17*20+ 9] =    9.00; daa[17*20+10] =   52.00; daa[17*20+11] =   10.00;
+            daa[17*20+12] =   24.00; daa[17*20+13] =   53.00; daa[17*20+14] =    6.00; daa[17*20+15] =   35.00;
+            daa[17*20+16] =   12.00; daa[18*20+ 0] =   11.00; daa[18*20+ 1] =   20.00; daa[18*20+ 2] =   70.00;
+            daa[18*20+ 3] =   46.00; daa[18*20+ 4] =  209.00; daa[18*20+ 5] =   24.00; daa[18*20+ 6] =    7.00;
+            daa[18*20+ 7] =    8.00; daa[18*20+ 8] =  573.00; daa[18*20+ 9] =   32.00; daa[18*20+10] =   24.00;
+            daa[18*20+11] =    8.00; daa[18*20+12] =   18.00; daa[18*20+13] =  536.00; daa[18*20+14] =   10.00;
+            daa[18*20+15] =   63.00; daa[18*20+16] =   21.00; daa[18*20+17] =   71.00; daa[19*20+ 0] =  298.00;
+            daa[19*20+ 1] =   17.00; daa[19*20+ 2] =   16.00; daa[19*20+ 3] =   31.00; daa[19*20+ 4] =   62.00;
+            daa[19*20+ 5] =   20.00; daa[19*20+ 6] =   45.00; daa[19*20+ 7] =   47.00; daa[19*20+ 8] =   11.00;
+            daa[19*20+ 9] =  961.00; daa[19*20+10] =  180.00; daa[19*20+11] =   14.00; daa[19*20+12] =  323.00;
+            daa[19*20+13] =   62.00; daa[19*20+14] =   23.00; daa[19*20+15] =   38.00; daa[19*20+16] =  112.00;
+            daa[19*20+17] =   25.00; daa[19*20+18] =   16.00;
+                    
+	    f[ 0] = 0.076748; f[ 1] = 0.051691; f[ 2] = 0.042645; f[ 3] = 0.051544;
+	    f[ 4] = 0.019803; f[ 5] = 0.040752; f[ 6] = 0.061830; f[ 7] = 0.073152;
+	    f[ 8] = 0.022944; f[ 9] = 0.053761; f[10] = 0.091904; f[11] = 0.058676;
+	    f[12] = 0.023826; f[13] = 0.040126; f[14] = 0.050901; f[15] = 0.068765;
+	    f[16] = 0.058565; f[17] = 0.014261; f[18] = 0.032102; f[19] = 0.066004;
+          }
+          break;
+        case  PLL_MTREV:
+          {
+            daa[ 1*20+ 0] =   23.18; daa[ 2*20+ 0] =   26.95; daa[ 2*20+ 1] =   13.24; daa[ 3*20+ 0] =   17.67;
+            daa[ 3*20+ 1] =    1.90; daa[ 3*20+ 2] =  794.38; daa[ 4*20+ 0] =   59.93; daa[ 4*20+ 1] =  103.33;
+            daa[ 4*20+ 2] =   58.94; daa[ 4*20+ 3] =    1.90; daa[ 5*20+ 0] =    1.90; daa[ 5*20+ 1] =  220.99;
+            daa[ 5*20+ 2] =  173.56; daa[ 5*20+ 3] =   55.28; daa[ 5*20+ 4] =   75.24; daa[ 6*20+ 0] =    9.77;
+            daa[ 6*20+ 1] =    1.90; daa[ 6*20+ 2] =   63.05; daa[ 6*20+ 3] =  583.55; daa[ 6*20+ 4] =    1.90;
+            daa[ 6*20+ 5] =  313.56; daa[ 7*20+ 0] =  120.71; daa[ 7*20+ 1] =   23.03; daa[ 7*20+ 2] =   53.30;
+            daa[ 7*20+ 3] =   56.77; daa[ 7*20+ 4] =   30.71; daa[ 7*20+ 5] =    6.75; daa[ 7*20+ 6] =   28.28;
+            daa[ 8*20+ 0] =   13.90; daa[ 8*20+ 1] =  165.23; daa[ 8*20+ 2] =  496.13; daa[ 8*20+ 3] =  113.99;
+            daa[ 8*20+ 4] =  141.49; daa[ 8*20+ 5] =  582.40; daa[ 8*20+ 6] =   49.12; daa[ 8*20+ 7] =    1.90;
+            daa[ 9*20+ 0] =   96.49; daa[ 9*20+ 1] =    1.90; daa[ 9*20+ 2] =   27.10; daa[ 9*20+ 3] =    4.34;
+            daa[ 9*20+ 4] =   62.73; daa[ 9*20+ 5] =    8.34; daa[ 9*20+ 6] =    3.31; daa[ 9*20+ 7] =    5.98;
+            daa[ 9*20+ 8] =   12.26; daa[10*20+ 0] =   25.46; daa[10*20+ 1] =   15.58; daa[10*20+ 2] =   15.16;
+            daa[10*20+ 3] =    1.90; daa[10*20+ 4] =   25.65; daa[10*20+ 5] =   39.70; daa[10*20+ 6] =    1.90;
+            daa[10*20+ 7] =    2.41; daa[10*20+ 8] =   11.49; daa[10*20+ 9] =  329.09; daa[11*20+ 0] =    8.36;
+            daa[11*20+ 1] =  141.40; daa[11*20+ 2] =  608.70; daa[11*20+ 3] =    2.31; daa[11*20+ 4] =    1.90;
+            daa[11*20+ 5] =  465.58; daa[11*20+ 6] =  313.86; daa[11*20+ 7] =   22.73; daa[11*20+ 8] =  127.67;
+            daa[11*20+ 9] =   19.57; daa[11*20+10] =   14.88; daa[12*20+ 0] =  141.88; daa[12*20+ 1] =    1.90;
+            daa[12*20+ 2] =   65.41; daa[12*20+ 3] =    1.90; daa[12*20+ 4] =    6.18; daa[12*20+ 5] =   47.37;
+            daa[12*20+ 6] =    1.90; daa[12*20+ 7] =    1.90; daa[12*20+ 8] =   11.97; daa[12*20+ 9] =  517.98;
+            daa[12*20+10] =  537.53; daa[12*20+11] =   91.37; daa[13*20+ 0] =    6.37; daa[13*20+ 1] =    4.69;
+            daa[13*20+ 2] =   15.20; daa[13*20+ 3] =    4.98; daa[13*20+ 4] =   70.80; daa[13*20+ 5] =   19.11;
+            daa[13*20+ 6] =    2.67; daa[13*20+ 7] =    1.90; daa[13*20+ 8] =   48.16; daa[13*20+ 9] =   84.67;
+            daa[13*20+10] =  216.06; daa[13*20+11] =    6.44; daa[13*20+12] =   90.82; daa[14*20+ 0] =   54.31;
+            daa[14*20+ 1] =   23.64; daa[14*20+ 2] =   73.31; daa[14*20+ 3] =   13.43; daa[14*20+ 4] =   31.26;
+            daa[14*20+ 5] =  137.29; daa[14*20+ 6] =   12.83; daa[14*20+ 7] =    1.90; daa[14*20+ 8] =   60.97;
+            daa[14*20+ 9] =   20.63; daa[14*20+10] =   40.10; daa[14*20+11] =   50.10; daa[14*20+12] =   18.84;
+            daa[14*20+13] =   17.31; daa[15*20+ 0] =  387.86; daa[15*20+ 1] =    6.04; daa[15*20+ 2] =  494.39;
+            daa[15*20+ 3] =   69.02; daa[15*20+ 4] =  277.05; daa[15*20+ 5] =   54.11; daa[15*20+ 6] =   54.71;
+            daa[15*20+ 7] =  125.93; daa[15*20+ 8] =   77.46; daa[15*20+ 9] =   47.70; daa[15*20+10] =   73.61;
+            daa[15*20+11] =  105.79; daa[15*20+12] =  111.16; daa[15*20+13] =   64.29; daa[15*20+14] =  169.90;
+            daa[16*20+ 0] =  480.72; daa[16*20+ 1] =    2.08; daa[16*20+ 2] =  238.46; daa[16*20+ 3] =   28.01;
+            daa[16*20+ 4] =  179.97; daa[16*20+ 5] =   94.93; daa[16*20+ 6] =   14.82; daa[16*20+ 7] =   11.17;
+            daa[16*20+ 8] =   44.78; daa[16*20+ 9] =  368.43; daa[16*20+10] =  126.40; daa[16*20+11] =  136.33;
+            daa[16*20+12] =  528.17; daa[16*20+13] =   33.85; daa[16*20+14] =  128.22; daa[16*20+15] =  597.21;
+            daa[17*20+ 0] =    1.90; daa[17*20+ 1] =   21.95; daa[17*20+ 2] =   10.68; daa[17*20+ 3] =   19.86;
+            daa[17*20+ 4] =   33.60; daa[17*20+ 5] =    1.90; daa[17*20+ 6] =    1.90; daa[17*20+ 7] =   10.92;
+            daa[17*20+ 8] =    7.08; daa[17*20+ 9] =    1.90; daa[17*20+10] =   32.44; daa[17*20+11] =   24.00;
+            daa[17*20+12] =   21.71; daa[17*20+13] =    7.84; daa[17*20+14] =    4.21; daa[17*20+15] =   38.58;
+            daa[17*20+16] =    9.99; daa[18*20+ 0] =    6.48; daa[18*20+ 1] =    1.90; daa[18*20+ 2] =  191.36;
+            daa[18*20+ 3] =   21.21; daa[18*20+ 4] =  254.77; daa[18*20+ 5] =   38.82; daa[18*20+ 6] =   13.12;
+            daa[18*20+ 7] =    3.21; daa[18*20+ 8] =  670.14; daa[18*20+ 9] =   25.01; daa[18*20+10] =   44.15;
+            daa[18*20+11] =   51.17; daa[18*20+12] =   39.96; daa[18*20+13] =  465.58; daa[18*20+14] =   16.21;
+            daa[18*20+15] =   64.92; daa[18*20+16] =   38.73; daa[18*20+17] =   26.25; daa[19*20+ 0] =  195.06;
+            daa[19*20+ 1] =    7.64; daa[19*20+ 2] =    1.90; daa[19*20+ 3] =    1.90; daa[19*20+ 4] =    1.90;
+            daa[19*20+ 5] =   19.00; daa[19*20+ 6] =   21.14; daa[19*20+ 7] =    2.53; daa[19*20+ 8] =    1.90;
+            daa[19*20+ 9] = 1222.94; daa[19*20+10] =   91.67; daa[19*20+11] =    1.90; daa[19*20+12] =  387.54;
+            daa[19*20+13] =    6.35; daa[19*20+14] =    8.23; daa[19*20+15] =    1.90; daa[19*20+16] =  204.54;
+            daa[19*20+17] =    5.37; daa[19*20+18] =    1.90;
+            
+            
+            f[ 0] = 0.072000; f[ 1] = 0.019000; f[ 2] = 0.039000; f[ 3] = 0.019000;
+            f[ 4] = 0.006000; f[ 5] = 0.025000; f[ 6] = 0.024000; f[ 7] = 0.056000;
+            f[ 8] = 0.028000; f[ 9] = 0.088000; f[10] = 0.169000; f[11] = 0.023000;
+            f[12] = 0.054000; f[13] = 0.061000; f[14] = 0.054000; f[15] = 0.072000;
+            f[16] = 0.086000; f[17] = 0.029000; f[18] = 0.033000; f[19] = 0.043000;
+          }
+          break;
+        case PLL_WAG:
+          {
+            daa[ 1*20+ 0] =  55.15710; daa[ 2*20+ 0] =  50.98480; daa[ 2*20+ 1] =  63.53460; 
+            daa[ 3*20+ 0] =  73.89980; daa[ 3*20+ 1] =  14.73040; daa[ 3*20+ 2] = 542.94200; 
+            daa[ 4*20+ 0] = 102.70400; daa[ 4*20+ 1] =  52.81910; daa[ 4*20+ 2] =  26.52560; 
+            daa[ 4*20+ 3] =   3.02949; daa[ 5*20+ 0] =  90.85980; daa[ 5*20+ 1] = 303.55000; 
+            daa[ 5*20+ 2] = 154.36400; daa[ 5*20+ 3] =  61.67830; daa[ 5*20+ 4] =   9.88179; 
+            daa[ 6*20+ 0] = 158.28500; daa[ 6*20+ 1] =  43.91570; daa[ 6*20+ 2] =  94.71980; 
+            daa[ 6*20+ 3] = 617.41600; daa[ 6*20+ 4] =   2.13520; daa[ 6*20+ 5] = 546.94700; 
+            daa[ 7*20+ 0] = 141.67200; daa[ 7*20+ 1] =  58.46650; daa[ 7*20+ 2] = 112.55600; 
+            daa[ 7*20+ 3] =  86.55840; daa[ 7*20+ 4] =  30.66740; daa[ 7*20+ 5] =  33.00520; 
+            daa[ 7*20+ 6] =  56.77170; daa[ 8*20+ 0] =  31.69540; daa[ 8*20+ 1] = 213.71500; 
+            daa[ 8*20+ 2] = 395.62900; daa[ 8*20+ 3] =  93.06760; daa[ 8*20+ 4] =  24.89720; 
+            daa[ 8*20+ 5] = 429.41100; daa[ 8*20+ 6] =  57.00250; daa[ 8*20+ 7] =  24.94100; 
+            daa[ 9*20+ 0] =  19.33350; daa[ 9*20+ 1] =  18.69790; daa[ 9*20+ 2] =  55.42360; 
+            daa[ 9*20+ 3] =   3.94370; daa[ 9*20+ 4] =  17.01350; daa[ 9*20+ 5] =  11.39170; 
+            daa[ 9*20+ 6] =  12.73950; daa[ 9*20+ 7] =   3.04501; daa[ 9*20+ 8] =  13.81900; 
+            daa[10*20+ 0] =  39.79150; daa[10*20+ 1] =  49.76710; daa[10*20+ 2] =  13.15280; 
+            daa[10*20+ 3] =   8.48047; daa[10*20+ 4] =  38.42870; daa[10*20+ 5] =  86.94890; 
+            daa[10*20+ 6] =  15.42630; daa[10*20+ 7] =   6.13037; daa[10*20+ 8] =  49.94620; 
+            daa[10*20+ 9] = 317.09700; daa[11*20+ 0] =  90.62650; daa[11*20+ 1] = 535.14200; 
+            daa[11*20+ 2] = 301.20100; daa[11*20+ 3] =  47.98550; daa[11*20+ 4] =   7.40339; 
+            daa[11*20+ 5] = 389.49000; daa[11*20+ 6] = 258.44300; daa[11*20+ 7] =  37.35580; 
+            daa[11*20+ 8] =  89.04320; daa[11*20+ 9] =  32.38320; daa[11*20+10] =  25.75550; 
+            daa[12*20+ 0] =  89.34960; daa[12*20+ 1] =  68.31620; daa[12*20+ 2] =  19.82210; 
+            daa[12*20+ 3] =  10.37540; daa[12*20+ 4] =  39.04820; daa[12*20+ 5] = 154.52600; 
+            daa[12*20+ 6] =  31.51240; daa[12*20+ 7] =  17.41000; daa[12*20+ 8] =  40.41410; 
+            daa[12*20+ 9] = 425.74600; daa[12*20+10] = 485.40200; daa[12*20+11] =  93.42760; 
+            daa[13*20+ 0] =  21.04940; daa[13*20+ 1] =  10.27110; daa[13*20+ 2] =   9.61621; 
+            daa[13*20+ 3] =   4.67304; daa[13*20+ 4] =  39.80200; daa[13*20+ 5] =   9.99208; 
+            daa[13*20+ 6] =   8.11339; daa[13*20+ 7] =   4.99310; daa[13*20+ 8] =  67.93710; 
+            daa[13*20+ 9] = 105.94700; daa[13*20+10] = 211.51700; daa[13*20+11] =   8.88360; 
+            daa[13*20+12] = 119.06300; daa[14*20+ 0] = 143.85500; daa[14*20+ 1] =  67.94890; 
+            daa[14*20+ 2] =  19.50810; daa[14*20+ 3] =  42.39840; daa[14*20+ 4] =  10.94040; 
+            daa[14*20+ 5] =  93.33720; daa[14*20+ 6] =  68.23550; daa[14*20+ 7] =  24.35700; 
+            daa[14*20+ 8] =  69.61980; daa[14*20+ 9] =   9.99288; daa[14*20+10] =  41.58440; 
+            daa[14*20+11] =  55.68960; daa[14*20+12] =  17.13290; daa[14*20+13] =  16.14440; 
+            daa[15*20+ 0] = 337.07900; daa[15*20+ 1] = 122.41900; daa[15*20+ 2] = 397.42300; 
+            daa[15*20+ 3] = 107.17600; daa[15*20+ 4] = 140.76600; daa[15*20+ 5] = 102.88700; 
+            daa[15*20+ 6] =  70.49390; daa[15*20+ 7] = 134.18200; daa[15*20+ 8] =  74.01690; 
+            daa[15*20+ 9] =  31.94400; daa[15*20+10] =  34.47390; daa[15*20+11] =  96.71300; 
+            daa[15*20+12] =  49.39050; daa[15*20+13] =  54.59310; daa[15*20+14] = 161.32800; 
+            daa[16*20+ 0] = 212.11100; daa[16*20+ 1] =  55.44130; daa[16*20+ 2] = 203.00600; 
+            daa[16*20+ 3] =  37.48660; daa[16*20+ 4] =  51.29840; daa[16*20+ 5] =  85.79280; 
+            daa[16*20+ 6] =  82.27650; daa[16*20+ 7] =  22.58330; daa[16*20+ 8] =  47.33070; 
+            daa[16*20+ 9] = 145.81600; daa[16*20+10] =  32.66220; daa[16*20+11] = 138.69800; 
+            daa[16*20+12] = 151.61200; daa[16*20+13] =  17.19030; daa[16*20+14] =  79.53840; 
+            daa[16*20+15] = 437.80200; daa[17*20+ 0] =  11.31330; daa[17*20+ 1] = 116.39200; 
+            daa[17*20+ 2] =   7.19167; daa[17*20+ 3] =  12.97670; daa[17*20+ 4] =  71.70700; 
+            daa[17*20+ 5] =  21.57370; daa[17*20+ 6] =  15.65570; daa[17*20+ 7] =  33.69830; 
+            daa[17*20+ 8] =  26.25690; daa[17*20+ 9] =  21.24830; daa[17*20+10] =  66.53090; 
+            daa[17*20+11] =  13.75050; daa[17*20+12] =  51.57060; daa[17*20+13] = 152.96400; 
+            daa[17*20+14] =  13.94050; daa[17*20+15] =  52.37420; daa[17*20+16] =  11.08640; 
+            daa[18*20+ 0] =  24.07350; daa[18*20+ 1] =  38.15330; daa[18*20+ 2] = 108.60000; 
+            daa[18*20+ 3] =  32.57110; daa[18*20+ 4] =  54.38330; daa[18*20+ 5] =  22.77100; 
+            daa[18*20+ 6] =  19.63030; daa[18*20+ 7] =  10.36040; daa[18*20+ 8] = 387.34400; 
+            daa[18*20+ 9] =  42.01700; daa[18*20+10] =  39.86180; daa[18*20+11] =  13.32640; 
+            daa[18*20+12] =  42.84370; daa[18*20+13] = 645.42800; daa[18*20+14] =  21.60460; 
+            daa[18*20+15] =  78.69930; daa[18*20+16] =  29.11480; daa[18*20+17] = 248.53900; 
+            daa[19*20+ 0] = 200.60100; daa[19*20+ 1] =  25.18490; daa[19*20+ 2] =  19.62460; 
+            daa[19*20+ 3] =  15.23350; daa[19*20+ 4] = 100.21400; daa[19*20+ 5] =  30.12810; 
+            daa[19*20+ 6] =  58.87310; daa[19*20+ 7] =  18.72470; daa[19*20+ 8] =  11.83580; 
+            daa[19*20+ 9] = 782.13000; daa[19*20+10] = 180.03400; daa[19*20+11] =  30.54340; 
+            daa[19*20+12] = 205.84500; daa[19*20+13] =  64.98920; daa[19*20+14] =  31.48870; 
+            daa[19*20+15] =  23.27390; daa[19*20+16] = 138.82300; daa[19*20+17] =  36.53690; 
+            daa[19*20+18] =  31.47300; 
+                   
+	    f[0]  = 0.0866279; f[1]  = 0.043972;  f[2]  = 0.0390894; f[3]  = 0.0570451;
+	    f[4]  = 0.0193078; f[5]  = 0.0367281; f[6]  = 0.0580589; f[7]  = 0.0832518;
+	    f[8]  = 0.0244313; f[9]  = 0.048466;  f[10] = 0.086209;  f[11] = 0.0620286;
+	    f[12] = 0.0195027; f[13] = 0.0384319; f[14] = 0.0457631; f[15] = 0.0695179;
+	    f[16] = 0.0610127; f[17] = 0.0143859; f[18] = 0.0352742; f[19] = 0.0708957;   
+          }
+          break;
+        case PLL_RTREV:
+          {
+            daa[1*20+0]= 34;         daa[2*20+0]= 51;         daa[2*20+1]= 35;         daa[3*20+0]= 10;         
+            daa[3*20+1]= 30;         daa[3*20+2]= 384;        daa[4*20+0]= 439;        daa[4*20+1]= 92;         
+            daa[4*20+2]= 128;        daa[4*20+3]= 1;          daa[5*20+0]= 32;         daa[5*20+1]= 221;        
+            daa[5*20+2]= 236;        daa[5*20+3]= 78;         daa[5*20+4]= 70;         daa[6*20+0]= 81;         
+            daa[6*20+1]= 10;         daa[6*20+2]= 79;         daa[6*20+3]= 542;        daa[6*20+4]= 1;          
+            daa[6*20+5]= 372;        daa[7*20+0]= 135;        daa[7*20+1]= 41;         daa[7*20+2]= 94;         
+            daa[7*20+3]= 61;         daa[7*20+4]= 48;         daa[7*20+5]= 18;         daa[7*20+6]= 70;         
+            daa[8*20+0]= 30;         daa[8*20+1]= 90;         daa[8*20+2]= 320;        daa[8*20+3]= 91;         
+            daa[8*20+4]= 124;        daa[8*20+5]= 387;        daa[8*20+6]= 34;         daa[8*20+7]= 68;         
+            daa[9*20+0]= 1;          daa[9*20+1]= 24;         daa[9*20+2]= 35;         daa[9*20+3]= 1;          
+            daa[9*20+4]= 104;        daa[9*20+5]= 33;         daa[9*20+6]= 1;          daa[9*20+7]= 1;          
+            daa[9*20+8]= 34;         daa[10*20+0]= 45;        daa[10*20+1]= 18;        daa[10*20+2]= 15;        
+            daa[10*20+3]= 5;         daa[10*20+4]= 110;       daa[10*20+5]= 54;        daa[10*20+6]= 21;        
+            daa[10*20+7]= 3;         daa[10*20+8]= 51;        daa[10*20+9]= 385;       daa[11*20+0]= 38;        
+            daa[11*20+1]= 593;       daa[11*20+2]= 123;       daa[11*20+3]= 20;        daa[11*20+4]= 16;        
+            daa[11*20+5]= 309;       daa[11*20+6]= 141;       daa[11*20+7]= 30;        daa[11*20+8]= 76;        
+            daa[11*20+9]= 34;        daa[11*20+10]= 23;       daa[12*20+0]= 235;       daa[12*20+1]= 57;        
+            daa[12*20+2]= 1;         daa[12*20+3]= 1;         daa[12*20+4]= 156;       daa[12*20+5]= 158;       
+            daa[12*20+6]= 1;         daa[12*20+7]= 37;        daa[12*20+8]= 116;       daa[12*20+9]= 375;       
+            daa[12*20+10]= 581;      daa[12*20+11]= 134;      daa[13*20+0]= 1;         daa[13*20+1]= 7;         
+            daa[13*20+2]= 49;        daa[13*20+3]= 1;         daa[13*20+4]= 70;        daa[13*20+5]= 1;         
+            daa[13*20+6]= 1;         daa[13*20+7]= 7;         daa[13*20+8]= 141;       daa[13*20+9]= 64;        
+            daa[13*20+10]= 179;      daa[13*20+11]= 14;       daa[13*20+12]= 247;      daa[14*20+0]= 97;        
+            daa[14*20+1]= 24;        daa[14*20+2]= 33;        daa[14*20+3]= 55;        daa[14*20+4]= 1;         
+            daa[14*20+5]= 68;        daa[14*20+6]= 52;        daa[14*20+7]= 17;        daa[14*20+8]= 44;        
+            daa[14*20+9]= 10;        daa[14*20+10]= 22;       daa[14*20+11]= 43;       daa[14*20+12]= 1;        
+            daa[14*20+13]= 11;       daa[15*20+0]= 460;       daa[15*20+1]= 102;       daa[15*20+2]= 294;       
+            daa[15*20+3]= 136;       daa[15*20+4]= 75;        daa[15*20+5]= 225;       daa[15*20+6]= 95;        
+            daa[15*20+7]= 152;       daa[15*20+8]= 183;       daa[15*20+9]= 4;         daa[15*20+10]= 24;       
+            daa[15*20+11]= 77;       daa[15*20+12]= 1;        daa[15*20+13]= 20;       daa[15*20+14]= 134;      
+            daa[16*20+0]= 258;       daa[16*20+1]= 64;        daa[16*20+2]= 148;       daa[16*20+3]= 55;        
+            daa[16*20+4]= 117;       daa[16*20+5]= 146;       daa[16*20+6]= 82;        daa[16*20+7]= 7;         
+            daa[16*20+8]= 49;        daa[16*20+9]= 72;        daa[16*20+10]= 25;       daa[16*20+11]= 110;      
+            daa[16*20+12]= 131;      daa[16*20+13]= 69;       daa[16*20+14]= 62;       daa[16*20+15]= 671;      
+            daa[17*20+0]= 5;         daa[17*20+1]= 13;        daa[17*20+2]= 16;        daa[17*20+3]= 1;         
+            daa[17*20+4]= 55;        daa[17*20+5]= 10;        daa[17*20+6]= 17;        daa[17*20+7]= 23;        
+            daa[17*20+8]= 48;        daa[17*20+9]= 39;        daa[17*20+10]= 47;       daa[17*20+11]= 6;        
+            daa[17*20+12]= 111;      daa[17*20+13]= 182;      daa[17*20+14]= 9;        daa[17*20+15]= 14;       
+            daa[17*20+16]= 1;        daa[18*20+0]= 55;        daa[18*20+1]= 47;        daa[18*20+2]= 28;        
+            daa[18*20+3]= 1;         daa[18*20+4]= 131;       daa[18*20+5]= 45;        daa[18*20+6]= 1;         
+            daa[18*20+7]= 21;        daa[18*20+8]= 307;       daa[18*20+9]= 26;        daa[18*20+10]= 64;       
+            daa[18*20+11]= 1;        daa[18*20+12]= 74;       daa[18*20+13]= 1017;     daa[18*20+14]= 14;       
+            daa[18*20+15]= 31;       daa[18*20+16]= 34;       daa[18*20+17]= 176;      daa[19*20+0]= 197;       
+            daa[19*20+1]= 29;        daa[19*20+2]= 21;        daa[19*20+3]= 6;         daa[19*20+4]= 295;       
+            daa[19*20+5]= 36;        daa[19*20+6]= 35;        daa[19*20+7]= 3;         daa[19*20+8]= 1;         
+            daa[19*20+9]= 1048;      daa[19*20+10]= 112;      daa[19*20+11]= 19;       daa[19*20+12]= 236;      
+            daa[19*20+13]= 92;       daa[19*20+14]= 25;       daa[19*20+15]= 39;       daa[19*20+16]= 196;      
+            daa[19*20+17]= 26;       daa[19*20+18]= 59;       
+            
+            f[0]= 0.0646;           f[1]= 0.0453;           f[2]= 0.0376;           f[3]= 0.0422;           
+            f[4]= 0.0114;           f[5]= 0.0606;           f[6]= 0.0607;           f[7]= 0.0639;           
+            f[8]= 0.0273;           f[9]= 0.0679;           f[10]= 0.1018;          f[11]= 0.0751;          
+            f[12]= 0.015;           f[13]= 0.0287;          f[14]= 0.0681;          f[15]= 0.0488;          
+            f[16]= 0.0622;          f[17]= 0.0251;          f[18]= 0.0318;          f[19]= 0.0619;                  
+          }
+          break;
+        case PLL_CPREV:
+          {
+            daa[1*20+0]= 105;        daa[2*20+0]= 227;        daa[2*20+1]= 357;        daa[3*20+0]= 175;        
+            daa[3*20+1]= 43;         daa[3*20+2]= 4435;       daa[4*20+0]= 669;        daa[4*20+1]= 823;        
+            daa[4*20+2]= 538;        daa[4*20+3]= 10;         daa[5*20+0]= 157;        daa[5*20+1]= 1745;       
+            daa[5*20+2]= 768;        daa[5*20+3]= 400;        daa[5*20+4]= 10;         daa[6*20+0]= 499;        
+            daa[6*20+1]= 152;        daa[6*20+2]= 1055;       daa[6*20+3]= 3691;       daa[6*20+4]= 10;         
+            daa[6*20+5]= 3122;       daa[7*20+0]= 665;        daa[7*20+1]= 243;        daa[7*20+2]= 653;        
+            daa[7*20+3]= 431;        daa[7*20+4]= 303;        daa[7*20+5]= 133;        daa[7*20+6]= 379;        
+            daa[8*20+0]= 66;         daa[8*20+1]= 715;        daa[8*20+2]= 1405;       daa[8*20+3]= 331;        
+            daa[8*20+4]= 441;        daa[8*20+5]= 1269;       daa[8*20+6]= 162;        daa[8*20+7]= 19;         
+            daa[9*20+0]= 145;        daa[9*20+1]= 136;        daa[9*20+2]= 168;        daa[9*20+3]= 10;         
+            daa[9*20+4]= 280;        daa[9*20+5]= 92;         daa[9*20+6]= 148;        daa[9*20+7]= 40;         
+            daa[9*20+8]= 29;         daa[10*20+0]= 197;       daa[10*20+1]= 203;       daa[10*20+2]= 113;       
+            daa[10*20+3]= 10;        daa[10*20+4]= 396;       daa[10*20+5]= 286;       daa[10*20+6]= 82;        
+            daa[10*20+7]= 20;        daa[10*20+8]= 66;        daa[10*20+9]= 1745;      daa[11*20+0]= 236;       
+            daa[11*20+1]= 4482;      daa[11*20+2]= 2430;      daa[11*20+3]= 412;       daa[11*20+4]= 48;        
+            daa[11*20+5]= 3313;      daa[11*20+6]= 2629;      daa[11*20+7]= 263;       daa[11*20+8]= 305;       
+            daa[11*20+9]= 345;       daa[11*20+10]= 218;      daa[12*20+0]= 185;       daa[12*20+1]= 125;       
+            daa[12*20+2]= 61;        daa[12*20+3]= 47;        daa[12*20+4]= 159;       daa[12*20+5]= 202;       
+            daa[12*20+6]= 113;       daa[12*20+7]= 21;        daa[12*20+8]= 10;        daa[12*20+9]= 1772;      
+            daa[12*20+10]= 1351;     daa[12*20+11]= 193;      daa[13*20+0]= 68;        daa[13*20+1]= 53;        
+            daa[13*20+2]= 97;        daa[13*20+3]= 22;        daa[13*20+4]= 726;       daa[13*20+5]= 10;        
+            daa[13*20+6]= 145;       daa[13*20+7]= 25;        daa[13*20+8]= 127;       daa[13*20+9]= 454;       
+            daa[13*20+10]= 1268;     daa[13*20+11]= 72;       daa[13*20+12]= 327;      daa[14*20+0]= 490;       
+            daa[14*20+1]= 87;        daa[14*20+2]= 173;       daa[14*20+3]= 170;       daa[14*20+4]= 285;       
+            daa[14*20+5]= 323;       daa[14*20+6]= 185;       daa[14*20+7]= 28;        daa[14*20+8]= 152;       
+            daa[14*20+9]= 117;       daa[14*20+10]= 219;      daa[14*20+11]= 302;      daa[14*20+12]= 100;      
+            daa[14*20+13]= 43;       daa[15*20+0]= 2440;      daa[15*20+1]= 385;       daa[15*20+2]= 2085;      
+            daa[15*20+3]= 590;       daa[15*20+4]= 2331;      daa[15*20+5]= 396;       daa[15*20+6]= 568;       
+            daa[15*20+7]= 691;       daa[15*20+8]= 303;       daa[15*20+9]= 216;       daa[15*20+10]= 516;      
+            daa[15*20+11]= 868;      daa[15*20+12]= 93;       daa[15*20+13]= 487;      daa[15*20+14]= 1202;     
+            daa[16*20+0]= 1340;      daa[16*20+1]= 314;       daa[16*20+2]= 1393;      daa[16*20+3]= 266;       
+            daa[16*20+4]= 576;       daa[16*20+5]= 241;       daa[16*20+6]= 369;       daa[16*20+7]= 92;        
+            daa[16*20+8]= 32;        daa[16*20+9]= 1040;      daa[16*20+10]= 156;      daa[16*20+11]= 918;      
+            daa[16*20+12]= 645;      daa[16*20+13]= 148;      daa[16*20+14]= 260;      daa[16*20+15]= 2151;     
+            daa[17*20+0]= 14;        daa[17*20+1]= 230;       daa[17*20+2]= 40;        daa[17*20+3]= 18;        
+            daa[17*20+4]= 435;       daa[17*20+5]= 53;        daa[17*20+6]= 63;        daa[17*20+7]= 82;        
+            daa[17*20+8]= 69;        daa[17*20+9]= 42;        daa[17*20+10]= 159;      daa[17*20+11]= 10;       
+            daa[17*20+12]= 86;       daa[17*20+13]= 468;      daa[17*20+14]= 49;       daa[17*20+15]= 73;       
+            daa[17*20+16]= 29;       daa[18*20+0]= 56;        daa[18*20+1]= 323;       daa[18*20+2]= 754;       
+            daa[18*20+3]= 281;       daa[18*20+4]= 1466;      daa[18*20+5]= 391;       daa[18*20+6]= 142;       
+            daa[18*20+7]= 10;        daa[18*20+8]= 1971;      daa[18*20+9]= 89;        daa[18*20+10]= 189;      
+            daa[18*20+11]= 247;      daa[18*20+12]= 215;      daa[18*20+13]= 2370;     daa[18*20+14]= 97;       
+            daa[18*20+15]= 522;      daa[18*20+16]= 71;       daa[18*20+17]= 346;      daa[19*20+0]= 968;       
+            daa[19*20+1]= 92;        daa[19*20+2]= 83;        daa[19*20+3]= 75;        daa[19*20+4]= 592;       
+            daa[19*20+5]= 54;        daa[19*20+6]= 200;       daa[19*20+7]= 91;        daa[19*20+8]= 25;        
+            daa[19*20+9]= 4797;      daa[19*20+10]= 865;      daa[19*20+11]= 249;      daa[19*20+12]= 475;      
+            daa[19*20+13]= 317;      daa[19*20+14]= 122;      daa[19*20+15]= 167;      daa[19*20+16]= 760;      
+            daa[19*20+17]= 10;       daa[19*20+18]= 119;      
+            
+            f[0]= 0.076;            f[1]= 0.062;            f[2]= 0.041;            f[3]= 0.037;            
+            f[4]= 0.009;            f[5]= 0.038;            f[6]= 0.049;            f[7]= 0.084;            
+            f[8]= 0.025;            f[9]= 0.081;            f[10]= 0.101;           f[11]= 0.05;            
+            f[12]= 0.022;           f[13]= 0.051;           f[14]= 0.043;           f[15]= 0.062;           
+            f[16]= 0.054;           f[17]= 0.018;           f[18]= 0.031;           f[19]= 0.066; 
+          }
+          break;
+        case PLL_VT:
+          {
+            /*
+              daa[1*20+0]= 0.233108;   daa[2*20+0]= 0.199097;   daa[2*20+1]= 0.210797;   daa[3*20+0]= 0.265145;   
+              daa[3*20+1]= 0.105191;   daa[3*20+2]= 0.883422;   daa[4*20+0]= 0.227333;   daa[4*20+1]= 0.031726;   
+              daa[4*20+2]= 0.027495;   daa[4*20+3]= 0.010313;   daa[5*20+0]= 0.310084;   daa[5*20+1]= 0.493763;   
+              daa[5*20+2]= 0.2757;     daa[5*20+3]= 0.205842;   daa[5*20+4]= 0.004315;   daa[6*20+0]= 0.567957;   
+              daa[6*20+1]= 0.25524;    daa[6*20+2]= 0.270417;   daa[6*20+3]= 1.599461;   daa[6*20+4]= 0.005321;   
+              daa[6*20+5]= 0.960976;   daa[7*20+0]= 0.876213;   daa[7*20+1]= 0.156945;   daa[7*20+2]= 0.362028;   
+              daa[7*20+3]= 0.311718;   daa[7*20+4]= 0.050876;   daa[7*20+5]= 0.12866;    daa[7*20+6]= 0.250447;   
+              daa[8*20+0]= 0.078692;   daa[8*20+1]= 0.213164;   daa[8*20+2]= 0.290006;   daa[8*20+3]= 0.134252;   
+              daa[8*20+4]= 0.016695;   daa[8*20+5]= 0.315521;   daa[8*20+6]= 0.104458;   daa[8*20+7]= 0.058131;   
+              daa[9*20+0]= 0.222972;   daa[9*20+1]= 0.08151;    daa[9*20+2]= 0.087225;   daa[9*20+3]= 0.01172;    
+              daa[9*20+4]= 0.046398;   daa[9*20+5]= 0.054602;   daa[9*20+6]= 0.046589;   daa[9*20+7]= 0.051089;   
+              daa[9*20+8]= 0.020039;   daa[10*20+0]= 0.42463;   daa[10*20+1]= 0.192364;  daa[10*20+2]= 0.069245;  
+              daa[10*20+3]= 0.060863;  daa[10*20+4]= 0.091709;  daa[10*20+5]= 0.24353;   daa[10*20+6]= 0.151924;  
+              daa[10*20+7]= 0.087056;  daa[10*20+8]= 0.103552;  daa[10*20+9]= 2.08989;   daa[11*20+0]= 0.393245;  
+              daa[11*20+1]= 1.755838;  daa[11*20+2]= 0.50306;   daa[11*20+3]= 0.261101;  daa[11*20+4]= 0.004067;  
+              daa[11*20+5]= 0.738208;  daa[11*20+6]= 0.88863;   daa[11*20+7]= 0.193243;  daa[11*20+8]= 0.153323;  
+              daa[11*20+9]= 0.093181;  daa[11*20+10]= 0.201204; daa[12*20+0]= 0.21155;   daa[12*20+1]= 0.08793;   
+              daa[12*20+2]= 0.05742;   daa[12*20+3]= 0.012182;  daa[12*20+4]= 0.02369;   daa[12*20+5]= 0.120801;  
+              daa[12*20+6]= 0.058643;  daa[12*20+7]= 0.04656;   daa[12*20+8]= 0.021157;  daa[12*20+9]= 0.493845;  
+              daa[12*20+10]= 1.105667; daa[12*20+11]= 0.096474; daa[13*20+0]= 0.116646;  daa[13*20+1]= 0.042569;  
+              daa[13*20+2]= 0.039769;  daa[13*20+3]= 0.016577;  daa[13*20+4]= 0.051127;  daa[13*20+5]= 0.026235;  
+              daa[13*20+6]= 0.028168;  daa[13*20+7]= 0.050143;  daa[13*20+8]= 0.079807;  daa[13*20+9]= 0.32102;   
+              daa[13*20+10]= 0.946499; daa[13*20+11]= 0.038261; daa[13*20+12]= 0.173052; daa[14*20+0]= 0.399143;  
+              daa[14*20+1]= 0.12848;   daa[14*20+2]= 0.083956;  daa[14*20+3]= 0.160063;  daa[14*20+4]= 0.011137;  
+              daa[14*20+5]= 0.15657;   daa[14*20+6]= 0.205134;  daa[14*20+7]= 0.124492;  daa[14*20+8]= 0.078892;  
+              daa[14*20+9]= 0.054797;  daa[14*20+10]= 0.169784; daa[14*20+11]= 0.212302; daa[14*20+12]= 0.010363; 
+              daa[14*20+13]= 0.042564; daa[15*20+0]= 1.817198;  daa[15*20+1]= 0.292327;  daa[15*20+2]= 0.847049;  
+              daa[15*20+3]= 0.461519;  daa[15*20+4]= 0.17527;   daa[15*20+5]= 0.358017;  daa[15*20+6]= 0.406035;  
+              daa[15*20+7]= 0.612843;  daa[15*20+8]= 0.167406;  daa[15*20+9]= 0.081567;  daa[15*20+10]= 0.214977; 
+              daa[15*20+11]= 0.400072; daa[15*20+12]= 0.090515; daa[15*20+13]= 0.138119; daa[15*20+14]= 0.430431; 
+              daa[16*20+0]= 0.877877;  daa[16*20+1]= 0.204109;  daa[16*20+2]= 0.471268;  daa[16*20+3]= 0.178197;  
+              daa[16*20+4]= 0.079511;  daa[16*20+5]= 0.248992;  daa[16*20+6]= 0.321028;  daa[16*20+7]= 0.136266;  
+              daa[16*20+8]= 0.101117;  daa[16*20+9]= 0.376588;  daa[16*20+10]= 0.243227; daa[16*20+11]= 0.446646; 
+              daa[16*20+12]= 0.184609; daa[16*20+13]= 0.08587;  daa[16*20+14]= 0.207143; daa[16*20+15]= 1.767766; 
+              daa[17*20+0]= 0.030309;  daa[17*20+1]= 0.046417;  daa[17*20+2]= 0.010459;  daa[17*20+3]= 0.011393;  
+              daa[17*20+4]= 0.007732;  daa[17*20+5]= 0.021248;  daa[17*20+6]= 0.018844;  daa[17*20+7]= 0.02399;   
+              daa[17*20+8]= 0.020009;  daa[17*20+9]= 0.034954;  daa[17*20+10]= 0.083439; daa[17*20+11]= 0.023321; 
+              daa[17*20+12]= 0.022019; daa[17*20+13]= 0.12805;  daa[17*20+14]= 0.014584; daa[17*20+15]= 0.035933; 
+              daa[17*20+16]= 0.020437; daa[18*20+0]= 0.087061;  daa[18*20+1]= 0.09701;   daa[18*20+2]= 0.093268;  
+              daa[18*20+3]= 0.051664;  daa[18*20+4]= 0.042823;  daa[18*20+5]= 0.062544;  daa[18*20+6]= 0.0552;    
+              daa[18*20+7]= 0.037568;  daa[18*20+8]= 0.286027;  daa[18*20+9]= 0.086237;  daa[18*20+10]= 0.189842; 
+              daa[18*20+11]= 0.068689; daa[18*20+12]= 0.073223; daa[18*20+13]= 0.898663; daa[18*20+14]= 0.032043; 
+              daa[18*20+15]= 0.121979; daa[18*20+16]= 0.094617; daa[18*20+17]= 0.124746; daa[19*20+0]= 1.230985;  
+              daa[19*20+1]= 0.113146;  daa[19*20+2]= 0.049824;  daa[19*20+3]= 0.048769;  daa[19*20+4]= 0.163831;  
+              daa[19*20+5]= 0.112027;  daa[19*20+6]= 0.205868;  daa[19*20+7]= 0.082579;  daa[19*20+8]= 0.068575;  
+              daa[19*20+9]= 3.65443;   daa[19*20+10]= 1.337571; daa[19*20+11]= 0.144587; daa[19*20+12]= 0.307309; 
+              daa[19*20+13]= 0.247329; daa[19*20+14]= 0.129315; daa[19*20+15]= 0.1277;   daa[19*20+16]= 0.740372; 
+              daa[19*20+17]= 0.022134; daa[19*20+18]= 0.125733;                     
+              
+              f[0]  = 0.07900;         f[1]= 0.05100;        f[2]  = 0.04200;         f[3]= 0.05300;         
+              f[4]  = 0.01500;         f[5]= 0.03700;        f[6]  = 0.06200;         f[7]= 0.07100;         
+              f[8]  = 0.02300;         f[9]= 0.06200;        f[10] = 0.09600;        f[11]= 0.05700;        
+              f[12] = 0.02400;        f[13]= 0.04300;        f[14] = 0.04400;        f[15]= 0.06400;        
+              f[16] = 0.05600;        f[17]= 0.01300;        f[18] = 0.03500;        f[19]= 0.07300; 
+            */
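+            /* the commented block above has no effect; the VT exchangeabilities
+               and frequencies actually used are assigned below */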
+
+            daa[1*20+0]=   1.2412691067876198;
+            daa[2*20+0]=   1.2184237953498958;
+            daa[2*20+1]=   1.5720770753326880;
+            daa[3*20+0]=   1.3759368509441177;
+            daa[3*20+1]=   0.7550654439001206;
+            daa[3*20+2]=   7.8584219153689405;
+            daa[4*20+0]=   2.4731223087544874;
+            daa[4*20+1]=   1.4414262567428417;
+            daa[4*20+2]=   0.9784679122774127;
+            daa[4*20+3]=   0.2272488448121475;
+            daa[5*20+0]=   2.2155167805137470;
+            daa[5*20+1]=   5.5120819705248678;
+            daa[5*20+2]=   3.0143201670924822;
+            daa[5*20+3]=   1.6562495638176040;
+            daa[5*20+4]=   0.4587469126746136;
+            daa[6*20+0]=   2.3379911207495061;
+            daa[6*20+1]=   1.3542404860613146;
+            daa[6*20+2]=   2.0093434778398112;
+            daa[6*20+3]=   9.6883451875685065;
+            daa[6*20+4]=   0.4519167943192672;
+            daa[6*20+5]=   6.8124601839937675;
+            daa[7*20+0]=   3.3386555146457697;
+            daa[7*20+1]=   1.3121700301622004;
+            daa[7*20+2]=   2.4117632898861809;
+            daa[7*20+3]=   1.9142079025990228;
+            daa[7*20+4]=   1.1034605684472507;
+            daa[7*20+5]=   0.8776110594765502;
+            daa[7*20+6]=   1.3860121390169038;
+            daa[8*20+0]=   0.9615841926910841;
+            daa[8*20+1]=   4.9238668283945266;
+            daa[8*20+2]=   6.1974384977884114;
+            daa[8*20+3]=   2.1459640610133781;
+            daa[8*20+4]=   1.5196756759380692;
+            daa[8*20+5]=   7.9943228564946525;
+            daa[8*20+6]=   1.6360079688522375;
+            daa[8*20+7]=   0.8561248973045037;
+            daa[9*20+0]=   0.8908203061925510;
+            daa[9*20+1]=   0.4323005487925516;
+            daa[9*20+2]=   0.9179291175331520;
+            daa[9*20+3]=   0.2161660372725585;
+            daa[9*20+4]=   0.9126668032539315;
+            daa[9*20+5]=   0.4882733432879921;
+            daa[9*20+6]=   0.4035497929633328;
+            daa[9*20+7]=   0.2888075033037488;
+            daa[9*20+8]=   0.5787937115407940;
+            daa[10*20+0]=  1.0778497408764076;
+            daa[10*20+1]=  0.8386701149158265;
+            daa[10*20+2]=  0.4098311270816011;
+            daa[10*20+3]=  0.3574207468998517;
+            daa[10*20+4]=  1.4081315998413697;
+            daa[10*20+5]=  1.3318097154194044;
+            daa[10*20+6]=  0.5610717242294755;
+            daa[10*20+7]=  0.3578662395745526;
+            daa[10*20+8]=  1.0765007949562073;
+            daa[10*20+9]=  6.0019110258426362;
+            daa[11*20+0]=  1.4932055816372476;
+            daa[11*20+1]=  10.017330817366002;
+            daa[11*20+2]=  4.4034547578962568;
+            daa[11*20+3]=  1.4521790561663968;
+            daa[11*20+4]=  0.3371091785647479;
+            daa[11*20+5]=  6.0519085243118811;
+            daa[11*20+6]=  4.3290086529582830;
+            daa[11*20+7]=  0.8945563662345198;
+            daa[11*20+8]=  1.8085136096039203;
+            daa[11*20+9]=  0.6244297525127139;
+            daa[11*20+10]= 0.5642322882556321;
+            daa[12*20+0]=  1.9006455961717605;
+            daa[12*20+1]=  1.2488638689609959;
+            daa[12*20+2]=  0.9378803706165143;
+            daa[12*20+3]=  0.4075239926000898;
+            daa[12*20+4]=  1.2213054800811556;
+            daa[12*20+5]=  1.9106190827629084;
+            daa[12*20+6]=  0.7471936218068498;
+            daa[12*20+7]=  0.5954812791740037;
+            daa[12*20+8]=  1.3808291710019667;
+            daa[12*20+9]=  6.7597899772045418;
+            daa[12*20+10]= 8.0327792947421148;
+            daa[12*20+11]= 1.7129670976916258;
+            daa[13*20+0]=  0.6883439026872615;
+            daa[13*20+1]=  0.4224945197276290;
+            daa[13*20+2]=  0.5044944273324311;
+            daa[13*20+3]=  0.1675129724559251;
+            daa[13*20+4]=  1.6953951980808002;
+            daa[13*20+5]=  0.3573432522499545;
+            daa[13*20+6]=  0.2317194387691585;
+            daa[13*20+7]=  0.3693722640980460;
+            daa[13*20+8]=  1.3629765501081097;
+            daa[13*20+9]=  2.2864286949316077;
+            daa[13*20+10]= 4.3611548063555778;
+            daa[13*20+11]= 0.3910559903834828;
+            daa[13*20+12]= 2.3201373546296349;
+            daa[14*20+0]=  2.7355620089953550;
+            daa[14*20+1]=  1.3091837782420783;
+            daa[14*20+2]=  0.7103720531974738;
+            daa[14*20+3]=  1.0714605979577547;
+            daa[14*20+4]=  0.4326227078645523;
+            daa[14*20+5]=  2.3019177728300728;
+            daa[14*20+6]=  1.5132807416252063;
+            daa[14*20+7]=  0.7744933618134962;
+            daa[14*20+8]=  1.8370555852070649;
+            daa[14*20+9]=  0.4811402387911145;
+            daa[14*20+10]= 1.0084320519837335;
+            daa[14*20+11]= 1.3918935593582853;
+            daa[14*20+12]= 0.4953193808676289;
+            daa[14*20+13]= 0.3746821107962129;
+            daa[15*20+0]=  6.4208961859142883;
+            daa[15*20+1]=  1.9202994262316166;
+            daa[15*20+2]=  6.1234512396801764;
+            daa[15*20+3]=  2.2161944596741829;
+            daa[15*20+4]=  3.6366815408744255;
+            daa[15*20+5]=  2.3193703643237220;
+            daa[15*20+6]=  1.8273535587773553;
+            daa[15*20+7]=  3.0637776193717610;
+            daa[15*20+8]=  1.9699895187387506;
+            daa[15*20+9]=  0.6047491507504744;
+            daa[15*20+10]= 0.8953754669269811;
+            daa[15*20+11]= 1.9776630140912268;
+            daa[15*20+12]= 1.0657482318076852;
+            daa[15*20+13]= 1.1079144700606407;
+            daa[15*20+14]= 3.5465914843628927;
+            daa[16*20+0]=  5.2892514169776437;
+            daa[16*20+1]=  1.3363401740560601;
+            daa[16*20+2]=  3.8852506105922231;
+            daa[16*20+3]=  1.5066839872944762;
+            daa[16*20+4]=  1.7557065205837685;
+            daa[16*20+5]=  2.1576510103471440;
+            daa[16*20+6]=  1.5839981708584689;
+            daa[16*20+7]=  0.7147489676267383;
+            daa[16*20+8]=  1.6136654573285647;
+            daa[16*20+9]=  2.6344778384442731;
+            daa[16*20+10]= 1.0192004372506540;
+            daa[16*20+11]= 2.5513781312660280;
+            daa[16*20+12]= 3.3628488360462363;
+            daa[16*20+13]= 0.6882725908872254;
+            daa[16*20+14]= 1.9485376673137556;
+            daa[16*20+15]= 8.8479984061248178;
+            daa[17*20+0]=  0.5488578478106930;
+            daa[17*20+1]=  1.5170142153962840;
+            daa[17*20+2]=  0.1808525752605976;
+            daa[17*20+3]=  0.2496584188151770;
+            daa[17*20+4]=  1.6275179891253113;
+            daa[17*20+5]=  0.8959082681546182;
+            daa[17*20+6]=  0.4198391148111098;
+            daa[17*20+7]=  0.9349753595598769;
+            daa[17*20+8]=  0.6301954684360302;
+            daa[17*20+9]=  0.5604648274060783;
+            daa[17*20+10]= 1.5183114434679339;
+            daa[17*20+11]= 0.5851920879490173;
+            daa[17*20+12]= 1.4680478689711018;
+            daa[17*20+13]= 3.3448437239772266;
+            daa[17*20+14]= 0.4326058001438786;
+            daa[17*20+15]= 0.6791126595939816;
+            daa[17*20+16]= 0.4514203099376473;
+            daa[18*20+0]=  0.5411769916657778;
+            daa[18*20+1]=  0.8912614404565405;
+            daa[18*20+2]=  1.0894926581511342;
+            daa[18*20+3]=  0.7447620891784513;
+            daa[18*20+4]=  2.1579775140421025;
+            daa[18*20+5]=  0.9183596801412757;
+            daa[18*20+6]=  0.5818111331782764;
+            daa[18*20+7]=  0.3374467649724478;
+            daa[18*20+8]=  7.7587442309146040;
+            daa[18*20+9]=  0.8626796044156272;
+            daa[18*20+10]= 1.2452243224541324;
+            daa[18*20+11]= 0.7835447533710449;
+            daa[18*20+12]= 1.0899165770956820;
+            daa[18*20+13]= 10.384852333133459;
+            daa[18*20+14]= 0.4819109019647465;
+            daa[18*20+15]= 0.9547229305958682;
+            daa[18*20+16]= 0.8564314184691215;
+            daa[18*20+17]= 4.5377235790405388;
+            daa[19*20+0]=  4.6501894691803214;
+            daa[19*20+1]=  0.7807017855806767;
+            daa[19*20+2]=  0.4586061981719967;
+            daa[19*20+3]=  0.4594535241660911;
+            daa[19*20+4]=  2.2627456996290891;
+            daa[19*20+5]=  0.6366932501396869;
+            daa[19*20+6]=  0.8940572875547330;
+            daa[19*20+7]=  0.6193321034173915;
+            daa[19*20+8]=  0.5333220944030346;
+            daa[19*20+9]=  14.872933461519061;
+            daa[19*20+10]= 3.5458093276667237;
+            daa[19*20+11]= 0.7801080335991272;
+            daa[19*20+12]= 4.0584577156753401;
+            daa[19*20+13]= 1.7039730522675411;
+            daa[19*20+14]= 0.5985498912985666;
+            daa[19*20+15]= 0.9305232113028208;
+            daa[19*20+16]= 3.4242218450865543;
+            daa[19*20+17]= 0.5658969249032649;
+            daa[19*20+18]= 1.0000000000000000;
+            
+            f[0]=  0.0770764620135024;
+            f[1]=  0.0500819370772208;
+            f[2]=  0.0462377395993731;
+            f[3]=  0.0537929860758246;
+            f[4]=  0.0144533387583345;
+            f[5]=  0.0408923608974345;
+            f[6]=  0.0633579339160905;
+            f[7]=  0.0655672355884439;
+            f[8]=  0.0218802687005936;
+            f[9]=  0.0591969699027449;
+            f[10]= 0.0976461276528445;
+            f[11]= 0.0592079410822730;
+            f[12]= 0.0220695876653368;
+            f[13]= 0.0413508521834260;
+            f[14]= 0.0476871596856874;
+            f[15]= 0.0707295165111524;
+            f[16]= 0.0567759161524817;
+            f[17]= 0.0127019797647213;
+            f[18]= 0.0323746050281867;
+            f[19]= 0.0669190817443274;
+          }
+          break;
+        case PLL_BLOSUM62:
+          {
+            daa[1*20+0]= 0.735790389698;  daa[2*20+0]= 0.485391055466;  daa[2*20+1]= 1.297446705134;  
+            daa[3*20+0]= 0.543161820899;  
+            daa[3*20+1]= 0.500964408555;  daa[3*20+2]= 3.180100048216;  daa[4*20+0]= 1.45999531047;   
+            daa[4*20+1]= 0.227826574209;  
+            daa[4*20+2]= 0.397358949897;  daa[4*20+3]= 0.240836614802;  daa[5*20+0]= 1.199705704602;  
+            daa[5*20+1]= 3.020833610064;  
+            daa[5*20+2]= 1.839216146992;  daa[5*20+3]= 1.190945703396;  daa[5*20+4]= 0.32980150463;   
+            daa[6*20+0]= 1.1709490428;    
+            daa[6*20+1]= 1.36057419042;   daa[6*20+2]= 1.24048850864;   daa[6*20+3]= 3.761625208368;  
+            daa[6*20+4]= 0.140748891814;  
+            daa[6*20+5]= 5.528919177928;  daa[7*20+0]= 1.95588357496;   daa[7*20+1]= 0.418763308518;  
+            daa[7*20+2]= 1.355872344485;  
+            daa[7*20+3]= 0.798473248968;  daa[7*20+4]= 0.418203192284;  daa[7*20+5]= 0.609846305383;  
+            daa[7*20+6]= 0.423579992176;  
+            daa[8*20+0]= 0.716241444998;  daa[8*20+1]= 1.456141166336;  daa[8*20+2]= 2.414501434208;  
+            daa[8*20+3]= 0.778142664022;  
+            daa[8*20+4]= 0.354058109831;  daa[8*20+5]= 2.43534113114;   daa[8*20+6]= 1.626891056982;  
+            daa[8*20+7]= 0.539859124954;  
+            daa[9*20+0]= 0.605899003687;  daa[9*20+1]= 0.232036445142;  daa[9*20+2]= 0.283017326278;  
+            daa[9*20+3]= 0.418555732462;  
+            daa[9*20+4]= 0.774894022794;  daa[9*20+5]= 0.236202451204;  daa[9*20+6]= 0.186848046932;  
+            daa[9*20+7]= 0.189296292376;  
+            daa[9*20+8]= 0.252718447885;  daa[10*20+0]= 0.800016530518; daa[10*20+1]= 0.622711669692; 
+            daa[10*20+2]= 0.211888159615; 
+            daa[10*20+3]= 0.218131577594; daa[10*20+4]= 0.831842640142; daa[10*20+5]= 0.580737093181; 
+            daa[10*20+6]= 0.372625175087; 
+            daa[10*20+7]= 0.217721159236; daa[10*20+8]= 0.348072209797; daa[10*20+9]= 3.890963773304; 
+            daa[11*20+0]= 1.295201266783; 
+            daa[11*20+1]= 5.411115141489; daa[11*20+2]= 1.593137043457; daa[11*20+3]= 1.032447924952; 
+            daa[11*20+4]= 0.285078800906; 
+            daa[11*20+5]= 3.945277674515; daa[11*20+6]= 2.802427151679; daa[11*20+7]= 0.752042440303; 
+            daa[11*20+8]= 1.022507035889; 
+            daa[11*20+9]= 0.406193586642; daa[11*20+10]= 0.445570274261;daa[12*20+0]= 1.253758266664; 
+            daa[12*20+1]= 0.983692987457; 
+            daa[12*20+2]= 0.648441278787; daa[12*20+3]= 0.222621897958; daa[12*20+4]= 0.76768882348;  
+            daa[12*20+5]= 2.494896077113; 
+            daa[12*20+6]= 0.55541539747;  daa[12*20+7]= 0.459436173579; daa[12*20+8]= 0.984311525359; 
+            daa[12*20+9]= 3.364797763104; 
+            daa[12*20+10]= 6.030559379572;daa[12*20+11]= 1.073061184332;daa[13*20+0]= 0.492964679748; 
+            daa[13*20+1]= 0.371644693209; 
+            daa[13*20+2]= 0.354861249223; daa[13*20+3]= 0.281730694207; daa[13*20+4]= 0.441337471187; 
+            daa[13*20+5]= 0.14435695975;  
+            daa[13*20+6]= 0.291409084165; daa[13*20+7]= 0.368166464453; daa[13*20+8]= 0.714533703928; 
+            daa[13*20+9]= 1.517359325954; 
+            daa[13*20+10]= 2.064839703237;daa[13*20+11]= 0.266924750511;daa[13*20+12]= 1.77385516883; 
+            daa[14*20+0]= 1.173275900924; 
+            daa[14*20+1]= 0.448133661718; daa[14*20+2]= 0.494887043702; daa[14*20+3]= 0.730628272998; 
+            daa[14*20+4]= 0.356008498769; 
+            daa[14*20+5]= 0.858570575674; daa[14*20+6]= 0.926563934846; daa[14*20+7]= 0.504086599527; daa[14*20+8]= 0.527007339151; 
+            daa[14*20+9]= 0.388355409206; daa[14*20+10]= 0.374555687471;daa[14*20+11]= 1.047383450722;daa[14*20+12]= 0.454123625103;
+            daa[14*20+13]= 0.233597909629;daa[15*20+0]= 4.325092687057; daa[15*20+1]= 1.12278310421;  daa[15*20+2]= 2.904101656456; 
+            daa[15*20+3]= 1.582754142065; daa[15*20+4]= 1.197188415094; daa[15*20+5]= 1.934870924596; daa[15*20+6]= 1.769893238937; 
+            daa[15*20+7]= 1.509326253224; daa[15*20+8]= 1.11702976291;  daa[15*20+9]= 0.35754441246;  daa[15*20+10]= 0.352969184527;
+            daa[15*20+11]= 1.752165917819;daa[15*20+12]= 0.918723415746;daa[15*20+13]= 0.540027644824;daa[15*20+14]= 1.169129577716;
+            daa[16*20+0]= 1.729178019485; daa[16*20+1]= 0.914665954563; daa[16*20+2]= 1.898173634533; daa[16*20+3]= 0.934187509431; 
+            daa[16*20+4]= 1.119831358516; daa[16*20+5]= 1.277480294596; daa[16*20+6]= 1.071097236007; daa[16*20+7]= 0.641436011405; 
+            daa[16*20+8]= 0.585407090225; daa[16*20+9]= 1.17909119726;  daa[16*20+10]= 0.915259857694;daa[16*20+11]= 1.303875200799;
+            daa[16*20+12]= 1.488548053722;daa[16*20+13]= 0.488206118793;daa[16*20+14]= 1.005451683149;daa[16*20+15]= 5.15155629227; 
+            daa[17*20+0]= 0.465839367725; daa[17*20+1]= 0.426382310122; daa[17*20+2]= 0.191482046247; daa[17*20+3]= 0.145345046279; 
+            daa[17*20+4]= 0.527664418872; daa[17*20+5]= 0.758653808642; daa[17*20+6]= 0.407635648938; daa[17*20+7]= 0.508358924638; 
+            daa[17*20+8]= 0.30124860078;  daa[17*20+9]= 0.34198578754;  daa[17*20+10]= 0.6914746346;  daa[17*20+11]= 0.332243040634;
+            daa[17*20+12]= 0.888101098152;daa[17*20+13]= 2.074324893497;daa[17*20+14]= 0.252214830027;daa[17*20+15]= 0.387925622098;
+            daa[17*20+16]= 0.513128126891;daa[18*20+0]= 0.718206697586; daa[18*20+1]= 0.720517441216; daa[18*20+2]= 0.538222519037; 
+            daa[18*20+3]= 0.261422208965; daa[18*20+4]= 0.470237733696; daa[18*20+5]= 0.95898974285;  daa[18*20+6]= 0.596719300346; 
+            daa[18*20+7]= 0.308055737035; daa[18*20+8]= 4.218953969389; daa[18*20+9]= 0.674617093228; daa[18*20+10]= 0.811245856323;
+            daa[18*20+11]= 0.7179934869;  daa[18*20+12]= 0.951682162246;daa[18*20+13]= 6.747260430801;daa[18*20+14]= 0.369405319355;
+            daa[18*20+15]= 0.796751520761;daa[18*20+16]= 0.801010243199;daa[18*20+17]= 4.054419006558;daa[19*20+0]= 2.187774522005; 
+            daa[19*20+1]= 0.438388343772; daa[19*20+2]= 0.312858797993; daa[19*20+3]= 0.258129289418; daa[19*20+4]= 1.116352478606; 
+            daa[19*20+5]= 0.530785790125; daa[19*20+6]= 0.524253846338; daa[19*20+7]= 0.25334079019;  daa[19*20+8]= 0.20155597175;  
+            daa[19*20+9]= 8.311839405458; daa[19*20+10]= 2.231405688913;daa[19*20+11]= 0.498138475304;daa[19*20+12]= 2.575850755315;
+            daa[19*20+13]= 0.838119610178;daa[19*20+14]= 0.496908410676;daa[19*20+15]= 0.561925457442;daa[19*20+16]= 2.253074051176;
+            daa[19*20+17]= 0.266508731426;daa[19*20+18]= 1;             
+            
+            f[0]= 0.074;                 f[1]= 0.052;                 f[2]= 0.045;                 f[3]= 0.054;                 
+            f[4]= 0.025;                 f[5]= 0.034;                 f[6]= 0.054;                 f[7]= 0.074;                 
+            f[8]= 0.026;                 f[9]= 0.068;                 f[10]= 0.099;                f[11]= 0.058;                
+            f[12]= 0.025;                f[13]= 0.047;                f[14]= 0.039;                f[15]= 0.057;                
+            f[16]= 0.051;                f[17]= 0.013;                f[18]= 0.032;                f[19]= 0.073;
+          }
+          break;
+        case PLL_MTMAM:
+          {
+            daa[1*20+0]= 32;              daa[2*20+0]= 2;    daa[2*20+1]= 4;               daa[3*20+0]= 11;
+            daa[3*20+1]= 0;               daa[3*20+2]= 864;  daa[4*20+0]= 0;               daa[4*20+1]= 186;
+            daa[4*20+2]= 0;               daa[4*20+3]= 0;    daa[5*20+0]= 0;               daa[5*20+1]= 246;
+            daa[5*20+2]= 8;               daa[5*20+3]= 49;   daa[5*20+4]= 0;               daa[6*20+0]= 0;
+            daa[6*20+1]= 0;               daa[6*20+2]= 0;    daa[6*20+3]= 569;             daa[6*20+4]= 0;
+            daa[6*20+5]= 274;             daa[7*20+0]= 78;   daa[7*20+1]= 18;              daa[7*20+2]= 47;
+            daa[7*20+3]= 79;              daa[7*20+4]= 0;    daa[7*20+5]= 0;               daa[7*20+6]= 22;
+            daa[8*20+0]= 8;               daa[8*20+1]= 232;  daa[8*20+2]= 458;             daa[8*20+3]= 11;
+            daa[8*20+4]= 305;             daa[8*20+5]= 550;  daa[8*20+6]= 22;              daa[8*20+7]= 0;
+            daa[9*20+0]= 75;              daa[9*20+1]= 0;    daa[9*20+2]= 19;              daa[9*20+3]= 0;
+            daa[9*20+4]= 41;              daa[9*20+5]= 0;    daa[9*20+6]= 0;               daa[9*20+7]= 0;
+            daa[9*20+8]= 0;               daa[10*20+0]= 21;  daa[10*20+1]= 6;              daa[10*20+2]= 0;
+            daa[10*20+3]= 0;              daa[10*20+4]= 27;  daa[10*20+5]= 20;             daa[10*20+6]= 0;
+            daa[10*20+7]= 0;              daa[10*20+8]= 26;  daa[10*20+9]= 232;            daa[11*20+0]= 0;
+            daa[11*20+1]= 50;             daa[11*20+2]= 408; daa[11*20+3]= 0;              daa[11*20+4]= 0;
+            daa[11*20+5]= 242;            daa[11*20+6]= 215; daa[11*20+7]= 0;              daa[11*20+8]= 0;
+            daa[11*20+9]= 6;              daa[11*20+10]= 4;  daa[12*20+0]= 76;             daa[12*20+1]= 0;
+            daa[12*20+2]= 21;             daa[12*20+3]= 0;   daa[12*20+4]= 0;              daa[12*20+5]= 22;
+            daa[12*20+6]= 0;              daa[12*20+7]= 0;   daa[12*20+8]= 0;              daa[12*20+9]= 378;
+            daa[12*20+10]= 609;           daa[12*20+11]= 59; daa[13*20+0]= 0;              daa[13*20+1]= 0;
+            daa[13*20+2]= 6;              daa[13*20+3]= 5;   daa[13*20+4]= 7;              daa[13*20+5]= 0;
+            daa[13*20+6]= 0;              daa[13*20+7]= 0;   daa[13*20+8]= 0;              daa[13*20+9]= 57;
+            daa[13*20+10]= 246;           daa[13*20+11]= 0;  daa[13*20+12]= 11;            daa[14*20+0]= 53;
+            daa[14*20+1]= 9;              daa[14*20+2]= 33;  daa[14*20+3]= 2;              daa[14*20+4]= 0;
+            daa[14*20+5]= 51;             daa[14*20+6]= 0;   daa[14*20+7]= 0;              daa[14*20+8]= 53;
+            daa[14*20+9]= 5;              daa[14*20+10]= 43; daa[14*20+11]= 18;            daa[14*20+12]= 0;
+            daa[14*20+13]= 17;            daa[15*20+0]= 342; daa[15*20+1]= 3;              daa[15*20+2]= 446;
+            daa[15*20+3]= 16;             daa[15*20+4]= 347; daa[15*20+5]= 30;             daa[15*20+6]= 21;
+            daa[15*20+7]= 112;            daa[15*20+8]= 20;  daa[15*20+9]= 0;              daa[15*20+10]= 74;
+            daa[15*20+11]= 65;            daa[15*20+12]= 47; daa[15*20+13]= 90;            daa[15*20+14]= 202;
+            daa[16*20+0]= 681;            daa[16*20+1]= 0;   daa[16*20+2]= 110;            daa[16*20+3]= 0;
+            daa[16*20+4]= 114;            daa[16*20+5]= 0;   daa[16*20+6]= 4;              daa[16*20+7]= 0;
+            daa[16*20+8]= 1;              daa[16*20+9]= 360; daa[16*20+10]= 34;            daa[16*20+11]= 50;
+            daa[16*20+12]= 691;           daa[16*20+13]= 8;  daa[16*20+14]= 78;            daa[16*20+15]= 614;
+            daa[17*20+0]= 5;              daa[17*20+1]= 16;  daa[17*20+2]= 6;              daa[17*20+3]= 0;
+            daa[17*20+4]= 65;             daa[17*20+5]= 0;   daa[17*20+6]= 0;              daa[17*20+7]= 0;
+            daa[17*20+8]= 0;              daa[17*20+9]= 0;   daa[17*20+10]= 12;            daa[17*20+11]= 0;
+            daa[17*20+12]= 13;            daa[17*20+13]= 0;  daa[17*20+14]= 7;             daa[17*20+15]= 17;
+            daa[17*20+16]= 0;             daa[18*20+0]= 0;   daa[18*20+1]= 0;              daa[18*20+2]= 156;
+            daa[18*20+3]= 0;              daa[18*20+4]= 530; daa[18*20+5]= 54;             daa[18*20+6]= 0;
+            daa[18*20+7]= 1;              daa[18*20+8]= 1525;daa[18*20+9]= 16;             daa[18*20+10]= 25;
+            daa[18*20+11]= 67;            daa[18*20+12]= 0;  daa[18*20+13]= 682;           daa[18*20+14]= 8;
+            daa[18*20+15]= 107;           daa[18*20+16]= 0;  daa[18*20+17]= 14;            daa[19*20+0]= 398;
+            daa[19*20+1]= 0;              daa[19*20+2]= 0;   daa[19*20+3]= 10;             daa[19*20+4]= 0;
+            daa[19*20+5]= 33;             daa[19*20+6]= 20;  daa[19*20+7]= 5;              daa[19*20+8]= 0;
+            daa[19*20+9]= 2220;           daa[19*20+10]= 100;daa[19*20+11]= 0;             daa[19*20+12]= 832;
+            daa[19*20+13]= 6;             daa[19*20+14]= 0;  daa[19*20+15]= 0;             daa[19*20+16]= 237;
+            daa[19*20+17]= 0;             daa[19*20+18]= 0;       
+            
+            f[0]= 0.06920;  f[1]=  0.01840;  f[2]= 0.04000;  f[3]= 0.018600;
+            f[4]= 0.00650;  f[5]=  0.02380;  f[6]= 0.02360;  f[7]= 0.055700;
+            f[8]= 0.02770;  f[9]=  0.09050;  f[10]=0.16750;  f[11]= 0.02210;
+            f[12]=0.05610;  f[13]= 0.06110;  f[14]=0.05360;  f[15]= 0.07250;
+            f[16]=0.08700;  f[17]= 0.02930;  f[18]=0.03400;  f[19]= 0.04280;
+          }
+          break;
+        case PLL_LG:
+          {
+            daa[1*20+0] = 0.425093;
+
+            daa[2*20+0] = 0.276818; daa[2*20+1] = 0.751878;
+
+            daa[3*20+0] = 0.395144; daa[3*20+1] = 0.123954; daa[3*20+2] = 5.076149;
+            
+            daa[4*20+0] = 2.489084; daa[4*20+1] = 0.534551; daa[4*20+2] = 0.528768; daa[4*20+3] = 0.062556;
+                                                                 
+            daa[5*20+0] = 0.969894; daa[5*20+1] = 2.807908; daa[5*20+2] = 1.695752; daa[5*20+3] = 0.523386; daa[5*20+4] = 0.084808;
+
+            daa[6*20+0] = 1.038545; daa[6*20+1] = 0.363970; daa[6*20+2] = 0.541712; daa[6*20+3] = 5.243870; daa[6*20+4] = 0.003499; daa[6*20+5] = 4.128591;
+
+            daa[7*20+0] = 2.066040; daa[7*20+1] = 0.390192; daa[7*20+2] = 1.437645; daa[7*20+3] = 0.844926; daa[7*20+4] = 0.569265; daa[7*20+5] = 0.267959; daa[7*20+6] = 0.348847;
+ 
+            daa[8*20+0] = 0.358858; daa[8*20+1] = 2.426601; daa[8*20+2] = 4.509238; daa[8*20+3] = 0.927114; daa[8*20+4] = 0.640543; daa[8*20+5] = 4.813505; daa[8*20+6] = 0.423881; 
+            daa[8*20+7] = 0.311484;
+
+            daa[9*20+0] = 0.149830; daa[9*20+1] = 0.126991; daa[9*20+2] = 0.191503; daa[9*20+3] = 0.010690; daa[9*20+4] = 0.320627; daa[9*20+5] = 0.072854; daa[9*20+6] = 0.044265; 
+            daa[9*20+7] = 0.008705; daa[9*20+8] = 0.108882; 
+
+            daa[10*20+0] = 0.395337; daa[10*20+1] = 0.301848; daa[10*20+2] = 0.068427; daa[10*20+3] = 0.015076; daa[10*20+4] = 0.594007; daa[10*20+5] = 0.582457; daa[10*20+6] = 0.069673; 
+            daa[10*20+7] = 0.044261; daa[10*20+8] = 0.366317; daa[10*20+9] = 4.145067 ;
+
+            daa[11*20+0] = 0.536518; daa[11*20+1] = 6.326067; daa[11*20+2] = 2.145078; daa[11*20+3] = 0.282959; daa[11*20+4] = 0.013266; daa[11*20+5] = 3.234294; daa[11*20+6] = 1.807177; 
+            daa[11*20+7] = 0.296636; daa[11*20+8] = 0.697264; daa[11*20+9] = 0.159069; daa[11*20+10] = 0.137500;
+
+
+            daa[12*20+0] = 1.124035; daa[12*20+1] = 0.484133; daa[12*20+2] = 0.371004; daa[12*20+3] = 0.025548; daa[12*20+4] = 0.893680; daa[12*20+5] = 1.672569; daa[12*20+6] = 0.173735; 
+            daa[12*20+7] = 0.139538; daa[12*20+8] = 0.442472; daa[12*20+9] = 4.273607; daa[12*20+10] = 6.312358; daa[12*20+11] = 0.656604;
+
+            daa[13*20+0] = 0.253701; daa[13*20+1] = 0.052722;daa[13*20+2] = 0.089525; daa[13*20+3] = 0.017416; daa[13*20+4] = 1.105251; daa[13*20+5] = 0.035855; daa[13*20+6] = 0.018811; 
+            daa[13*20+7] = 0.089586; daa[13*20+8] = 0.682139; daa[13*20+9] = 1.112727; daa[13*20+10] = 2.592692; daa[13*20+11] = 0.023918; daa[13*20+12] = 1.798853;
+
+            daa[14*20+0] = 1.177651; daa[14*20+1] = 0.332533;daa[14*20+2] = 0.161787; daa[14*20+3] = 0.394456; daa[14*20+4] = 0.075382; daa[14*20+5] = 0.624294; daa[14*20+6] = 0.419409; 
+            daa[14*20+7] = 0.196961; daa[14*20+8] = 0.508851; daa[14*20+9] = 0.078281; daa[14*20+10] = 0.249060; daa[14*20+11] = 0.390322; daa[14*20+12] = 0.099849; 
+            daa[14*20+13] = 0.094464;
+ 
+            daa[15*20+0] = 4.727182; daa[15*20+1] = 0.858151;daa[15*20+2] = 4.008358; daa[15*20+3] = 1.240275; daa[15*20+4] = 2.784478; daa[15*20+5] = 1.223828; daa[15*20+6] = 0.611973; 
+            daa[15*20+7] = 1.739990; daa[15*20+8] = 0.990012; daa[15*20+9] = 0.064105; daa[15*20+10] = 0.182287; daa[15*20+11] = 0.748683; daa[15*20+12] = 0.346960; 
+            daa[15*20+13] = 0.361819; daa[15*20+14] = 1.338132;
+ 
+            daa[16*20+0] = 2.139501; daa[16*20+1] = 0.578987;daa[16*20+2] = 2.000679; daa[16*20+3] = 0.425860; daa[16*20+4] = 1.143480; daa[16*20+5] = 1.080136; daa[16*20+6] = 0.604545; 
+            daa[16*20+7] = 0.129836; daa[16*20+8] = 0.584262; daa[16*20+9] = 1.033739; daa[16*20+10] = 0.302936; daa[16*20+11] = 1.136863; daa[16*20+12] = 2.020366; 
+            daa[16*20+13] = 0.165001; daa[16*20+14] = 0.571468; daa[16*20+15] = 6.472279;
+
+            daa[17*20+0] = 0.180717; daa[17*20+1] = 0.593607;daa[17*20+2] = 0.045376; daa[17*20+3] = 0.029890; daa[17*20+4] = 0.670128; daa[17*20+5] = 0.236199; daa[17*20+6] = 0.077852; 
+            daa[17*20+7] = 0.268491; daa[17*20+8] = 0.597054; daa[17*20+9] = 0.111660; daa[17*20+10] = 0.619632; daa[17*20+11] = 0.049906; daa[17*20+12] = 0.696175; 
+            daa[17*20+13] = 2.457121; daa[17*20+14] = 0.095131; daa[17*20+15] = 0.248862; daa[17*20+16] = 0.140825;
+
+            daa[18*20+0] = 0.218959; daa[18*20+1] = 0.314440;daa[18*20+2] = 0.612025; daa[18*20+3] = 0.135107; daa[18*20+4] = 1.165532; daa[18*20+5] = 0.257336; daa[18*20+6] = 0.120037; 
+            daa[18*20+7] = 0.054679; daa[18*20+8] = 5.306834; daa[18*20+9] = 0.232523; daa[18*20+10] = 0.299648; daa[18*20+11] = 0.131932; daa[18*20+12] = 0.481306; 
+            daa[18*20+13] = 7.803902; daa[18*20+14] = 0.089613; daa[18*20+15] = 0.400547; daa[18*20+16] = 0.245841; daa[18*20+17] = 3.151815;
+
+            daa[19*20+0] = 2.547870; daa[19*20+1] = 0.170887;daa[19*20+2] = 0.083688; daa[19*20+3] = 0.037967; daa[19*20+4] = 1.959291; daa[19*20+5] = 0.210332; daa[19*20+6] = 0.245034; 
+            daa[19*20+7] = 0.076701; daa[19*20+8] = 0.119013; daa[19*20+9] = 10.649107; daa[19*20+10] = 1.702745; daa[19*20+11] = 0.185202; daa[19*20+12] = 1.898718; 
+            daa[19*20+13] = 0.654683; daa[19*20+14] = 0.296501; daa[19*20+15] = 0.098369; daa[19*20+16] = 2.188158; daa[19*20+17] = 0.189510; daa[19*20+18] = 0.249313;
+            
+            f[0]  = 0.079066; f[1]  = 0.055941; f[2]  = 0.041977; f[3]  = 0.053052;
+            f[4]  = 0.012937; f[5]  = 0.040767; f[6]  = 0.071586; f[7]  = 0.057337;
+            f[8]  = 0.022355; f[9]  = 0.062157; f[10] = 0.099081; f[11] = 0.064600;
+            f[12] = 0.022951; f[13] = 0.042302; f[14] = 0.044040; f[15] = 0.061197;
+            f[16] = 0.053287; f[17] = 0.012066; f[18] = 0.034155; f[19] = 0.069146;
+          }
+          break;
+        case PLL_LG4M:
+          {
+            double 
+              rates[4][190] = 
+              {
+                {
+                  0.269343
+                  , 0.254612, 0.150988
+                  , 0.236821, 0.031863, 0.659648
+                  , 2.506547, 0.938594, 0.975736, 0.175533
+                  , 0.359080, 0.348288, 0.697708, 0.086573, 0.095967
+                  , 0.304674, 0.156000, 0.377704, 0.449140, 0.064706, 4.342595
+                  , 1.692015, 0.286638, 0.565095, 0.380358, 0.617945, 0.202058, 0.264342
+                  , 0.251974, 0.921633, 1.267609, 0.309692, 0.390429, 2.344059, 0.217750, 0.104842
+                  , 1.085220, 0.325624, 0.818658, 0.037814, 1.144150, 0.534567, 0.222793, 0.062682, 0.567431
+                  , 0.676353, 0.602366, 0.217027, 0.007533, 1.595775, 0.671143, 0.158424, 0.070463, 0.764255, 8.226528
+                  , 0.179155, 0.971338, 1.343718, 0.133744, 0.122468, 0.983857, 0.994128, 0.220916, 0.410581, 0.387487, 0.181110
+                  , 1.636817, 0.515217, 0.670461, 0.071252, 1.534848, 5.288642, 0.255628, 0.094198, 0.257229, 25.667158, 6.819689, 1.591212
+                  , 0.235498, 0.123932, 0.099793, 0.030425, 0.897279, 0.112229, 0.022529, 0.047488, 0.762914, 1.344259, 0.865691, 0.038921, 2.030833
+                  , 1.265605, 0.040163, 0.173354, 0.027579, 0.259961, 0.580374, 0.088041, 0.145595, 0.143676, 0.298859, 1.020117, 0.000714, 0.190019, 0.093964
+                  , 5.368405, 0.470952, 5.267140, 0.780505, 4.986071, 0.890554, 0.377949, 1.755515, 0.786352, 0.527246, 0.667783, 0.659948, 0.731921, 0.837669, 1.355630
+                  , 1.539394, 0.326789, 1.688169, 0.283738, 1.389282, 0.329821, 0.231770, 0.117017, 0.449977, 3.531600, 0.721586, 0.497588, 2.691697, 0.152088, 0.698040, 16.321298
+                  , 0.140944, 0.375611, 0.025163, 0.002757, 0.801456, 0.257253, 0.103678, 0.132995, 0.345834, 0.377156, 0.839647, 0.176970, 0.505682, 1.670170, 0.091298, 0.210096, 0.013165
+                  , 0.199836, 0.146857, 0.806275, 0.234246, 1.436970, 0.319669, 0.010076, 0.036859, 3.503317, 0.598632, 0.738969, 0.154436, 0.579000, 4.245524, 0.074524, 0.454195, 0.232913, 1.178490
+                  , 9.435529, 0.285934, 0.395670, 0.130890, 6.097263, 0.516259, 0.503665, 0.222960, 0.149143, 13.666175, 2.988174, 0.162725, 5.973826, 0.843416, 0.597394, 0.701149, 4.680002, 0.300085, 0.416262
+                },
+                {
+                  0.133720
+                  , 0.337212, 0.749052
+                  , 0.110918, 0.105087, 4.773487
+                  , 3.993460, 0.188305, 1.590332, 0.304942
+                  , 0.412075, 2.585774, 1.906884, 0.438367, 0.242076
+                  , 0.435295, 0.198278, 0.296366, 7.470333, 0.008443, 3.295515
+                  , 7.837540, 0.164607, 0.431724, 0.153850, 1.799716, 0.269744, 0.242866
+                  , 0.203872, 2.130334, 9.374479, 1.080878, 0.152458, 12.299133, 0.279589, 0.089714
+                  , 0.039718, 0.024553, 0.135254, 0.014979, 0.147498, 0.033964, 0.005585, 0.007248, 0.022746
+                  , 0.075784, 0.080091, 0.084971, 0.014128, 0.308347, 0.500836, 0.022833, 0.022999, 0.161270, 1.511682
+                  , 0.177662, 10.373708, 1.036721, 0.038303, 0.043030, 2.181033, 0.321165, 0.103050, 0.459502, 0.021215, 0.078395
+                  , 0.420784, 0.192765, 0.329545, 0.008331, 0.883142, 1.403324, 0.168673, 0.160728, 0.612573, 1.520889, 7.763266, 0.307903
+                  , 0.071268, 0.019652, 0.088753, 0.013547, 0.566609, 0.071878, 0.020050, 0.041022, 0.625361, 0.382806, 1.763059, 0.044644, 1.551911
+                  , 0.959127, 1.496585, 0.377794, 0.332010, 0.318192, 1.386970, 0.915904, 0.224255, 2.611479, 0.029351, 0.068250, 1.542356, 0.047525, 0.182715
+                  , 11.721512, 0.359408, 2.399158, 0.219464, 9.104192, 0.767563, 0.235229, 3.621219, 0.971955, 0.033780, 0.043035, 0.236929, 0.319964, 0.124977, 0.840651
+                  , 2.847068, 0.218463, 1.855386, 0.109808, 4.347048, 0.765848, 0.164569, 0.312024, 0.231569, 0.356327, 0.159597, 0.403210, 1.135162, 0.106903, 0.269190, 9.816481
+                  , 0.030203, 0.387292, 0.118878, 0.067287, 0.190240, 0.122113, 0.007023, 0.137411, 0.585141, 0.020634, 0.228824, 0.000122, 0.474862, 3.135128, 0.030313, 0.093830, 0.119152
+                  , 0.067183, 0.130101, 0.348730, 0.061798, 0.301198, 0.095382, 0.095764, 0.044628, 2.107384, 0.046105, 0.100117, 0.017073, 0.192383, 8.367641, 0.000937, 0.137416, 0.044722, 4.179782
+                  , 0.679398, 0.041567, 0.092408, 0.023701, 1.271187, 0.115566, 0.055277, 0.086988, 0.060779, 8.235167, 0.609420, 0.061764, 0.581962, 0.184187, 0.080246, 0.098033, 1.438350, 0.023439, 0.039124
+                },          
+                {
+                  0.421017
+                  , 0.316236, 0.693340
+                  , 0.285984, 0.059926, 6.158219
+                  , 4.034031, 1.357707, 0.708088, 0.063669
+                  , 0.886972, 2.791622, 1.701830, 0.484347, 0.414286
+                  , 0.760525, 0.233051, 0.378723, 4.032667, 0.081977, 4.940411
+                  , 0.754103, 0.402894, 2.227443, 1.102689, 0.416576, 0.459376, 0.508409
+                  , 0.571422, 2.319453, 5.579973, 0.885376, 1.439275, 4.101979, 0.576745, 0.428799
+                  , 0.162152, 0.085229, 0.095692, 0.006129, 0.490937, 0.104843, 0.045514, 0.004705, 0.098934
+                  , 0.308006, 0.287051, 0.056994, 0.007102, 0.958988, 0.578990, 0.067119, 0.024403, 0.342983, 3.805528
+                  , 0.390161, 7.663209, 1.663641, 0.105129, 0.135029, 3.364474, 0.652618, 0.457702, 0.823674, 0.129858, 0.145630
+                  , 1.042298, 0.364551, 0.293222, 0.037983, 1.486520, 1.681752, 0.192414, 0.070498, 0.222626, 4.529623, 4.781730, 0.665308
+                  , 0.362476, 0.073439, 0.129245, 0.020078, 1.992483, 0.114549, 0.023272, 0.064490, 1.491794, 1.113437, 2.132006, 0.041677, 1.928654
+                  , 1.755491, 0.087050, 0.099325, 0.163817, 0.242851, 0.322939, 0.062943, 0.198698, 0.192904, 0.062948, 0.180283, 0.059655, 0.129323, 0.065778
+                  , 3.975060, 0.893398, 5.496314, 1.397313, 3.575120, 1.385297, 0.576191, 1.733288, 1.021255, 0.065131, 0.129115, 0.600308, 0.387276, 0.446001, 1.298493
+                  , 2.565079, 0.534056, 2.143993, 0.411388, 2.279084, 0.893006, 0.528209, 0.135731, 0.518741, 0.972662, 0.280700, 0.890086, 1.828755, 0.189028, 0.563778, 7.788147
+                  , 0.283631, 0.497926, 0.075454, 0.043794, 1.335322, 0.308605, 0.140137, 0.150797, 1.409726, 0.119868, 0.818331, 0.080591, 1.066017, 3.754687, 0.073415, 0.435046, 0.197272
+                  , 0.242513, 0.199157, 0.472207, 0.085937, 2.039787, 0.262751, 0.084578, 0.032247, 7.762326, 0.153966, 0.299828, 0.117255, 0.438215, 14.506235, 0.089180, 0.352766, 0.215417, 5.054245
+                  , 2.795818, 0.107130, 0.060909, 0.029724, 2.986426, 0.197267, 0.196977, 0.044327, 0.116751, 7.144311, 1.848622, 0.118020, 1.999696, 0.705747, 0.272763, 0.096935, 1.820982, 0.217007, 0.172975
+                },
+                {
+                  0.576160
+                  , 0.567606, 0.498643
+                  , 0.824359, 0.050698, 3.301401
+                  , 0.822724, 4.529235, 1.291808, 0.101930
+                  , 1.254238, 2.169809, 1.427980, 0.449474, 0.868679
+                  , 1.218615, 0.154502, 0.411471, 3.172277, 0.050239, 2.138661
+                  , 1.803443, 0.604673, 2.125496, 1.276384, 1.598679, 0.502653, 0.479490
+                  , 0.516862, 2.874265, 4.845769, 0.719673, 3.825677, 4.040275, 0.292773, 0.596643
+                  , 0.180898, 0.444586, 0.550969, 0.023542, 2.349573, 0.370160, 0.142187, 0.016618, 0.500788
+                  , 0.452099, 0.866322, 0.201033, 0.026731, 2.813990, 1.645178, 0.135556, 0.072152, 1.168817, 5.696116
+                  , 0.664186, 2.902886, 2.101971, 0.127988, 0.200218, 2.505933, 0.759509, 0.333569, 0.623100, 0.547454, 0.363656
+                  , 0.864415, 0.835049, 0.632649, 0.079201, 2.105931, 1.633544, 0.216462, 0.252419, 0.665406, 7.994105, 11.751178, 1.096842
+                  , 0.324478, 0.208947, 0.280339, 0.041683, 4.788477, 0.107022, 0.067711, 0.171320, 3.324779, 2.965328, 5.133843, 0.084856, 4.042591
+                  , 1.073043, 0.173826, 0.041985, 0.270336, 0.121299, 0.351384, 0.228565, 0.225318, 0.376089, 0.058027, 0.390354, 0.214230, 0.058954, 0.126299
+                  , 3.837562, 0.884342, 4.571911, 0.942751, 6.592827, 1.080063, 0.465397, 3.137614, 1.119667, 0.362516, 0.602355, 0.716940, 0.506796, 1.444484, 1.432558
+                  , 2.106026, 0.750016, 2.323325, 0.335915, 1.654673, 1.194017, 0.617231, 0.318671, 0.801030, 4.455842, 0.580191, 1.384210, 3.522468, 0.473128, 0.432718, 5.716300
+                  , 0.163720, 0.818102, 0.072322, 0.068275, 3.305436, 0.373790, 0.054323, 0.476587, 1.100360, 0.392946, 1.703323, 0.085720, 1.725516, 5.436253, 0.053108, 0.498594, 0.231832
+                  , 0.241167, 0.302440, 1.055095, 0.246940, 9.741942, 0.249895, 0.129973, 0.052363, 11.542498, 1.047449, 1.319667, 0.139770, 1.330225, 26.562270, 0.046986, 0.737653, 0.313460, 5.165098
+                  , 1.824586, 0.435795, 0.179086, 0.091739, 3.609570, 0.649507, 0.656681, 0.225234, 0.473437, 19.897252, 3.001995, 0.452926, 3.929598, 1.692159, 0.370204, 0.373501, 3.329822, 0.326593, 0.860743
+                }
+              };
+            
+            double
+              freqs[4][20] = 
+              {{0.082276,0.055172,0.043853,0.053484,0.018957,0.028152,0.046679,0.157817,0.033297,0.028284,0.054284,0.025275,0.023665,0.041874,0.063071,0.066501,0.065424,0.023837,0.038633,0.049465},
+               {0.120900,0.036460,0.026510,0.040410,0.015980,0.021132,0.025191,0.036369,0.015884,0.111029,0.162852,0.024820,0.028023,0.074058,0.012065,0.041963,0.039072,0.012666,0.040478,0.114137},
+               {0.072639,0.051691,0.038642,0.055580,0.009829,0.031374,0.048731,0.065283,0.023791,0.086640,0.120847,0.052177,0.026728,0.032589,0.039238,0.046748,0.053361,0.008024,0.037426,0.098662},
+               {0.104843,0.078835,0.043513,0.090498,0.002924,0.066163,0.151640,0.038843,0.022556,0.018383,0.038687,0.104462,0.010166,0.009089,0.066950,0.053667,0.049486,0.004409,0.012924,0.031963}};
+            
+            int 
+              i, 
+              j, 
+              r = 0;
+            
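+            /* copy the 190 = 20*19/2 lower-triangular exchangeabilities of the
+               matrix selected by lg4_index into daa, row by row */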
+            for(i = 1; i < 20; i++)
+              for(j = 0; j < i; j++)
+                {
+                  daa[i * 20 + j] = rates[lg4_index][r];
+                  r++;
+                }
+            
+            assert(r == 190);
+            
+            for(i = 0; i < 20; i++)
+              f[i] = freqs[lg4_index][i];         
+            
+          }
+          break;
+        case PLL_LG4X:
+			{
+			  double
+			  rates[4][190] =
+				  {
+				  {
+				  0.295719,
+				  0.067388, 0.448317,
+				  0.253712, 0.457483, 2.358429,
+				  1.029289, 0.576016, 0.251987, 0.189008,
+				  0.107964, 1.741924, 0.216561, 0.599450, 0.029955,
+				  0.514644, 0.736017, 0.503084, 109.901504, 0.084794, 4.117654,
+				  10.868848, 0.704334, 0.435271, 1.070052, 1.862626, 0.246260, 1.202023,
+				  0.380498, 5.658311, 4.873453, 5.229858, 0.553477, 6.508329, 1.634845, 0.404968,
+				  0.084223, 0.123387, 0.090748, 0.052764, 0.151733, 0.054187, 0.060194, 0.048984, 0.204296,
+				  0.086976, 0.221777, 0.033310, 0.021407, 0.230320, 0.195703, 0.069359, 0.069963, 0.504221, 1.495537,
+				  0.188789, 93.433377, 0.746537, 0.621146, 0.096955, 1.669092, 2.448827, 0.256662, 1.991533, 0.091940, 0.122332,
+				  0.286389, 0.382175, 0.128905, 0.081091, 0.352526, 0.810168, 0.232297, 0.228519, 0.655465, 1.994320, 3.256485, 0.457430,
+				  0.155567, 0.235965, 0.127321, 0.205164, 0.590018, 0.066081, 0.064822, 0.241077, 6.799829, 0.754940, 2.261319, 0.163849, 1.559944,
+				  1.671061, 6.535048, 0.904011, 5.164456, 0.386853, 2.437439, 3.537387, 4.320442, 11.291065, 0.170343, 0.848067, 5.260446, 0.426508, 0.438856,
+				  2.132922, 0.525521, 0.939733, 0.747330, 1.559564, 0.165666, 0.435384, 3.656545, 0.961142, 0.050315, 0.064441, 0.360946, 0.132547, 0.306683, 4.586081,
+				  0.529591, 0.303537, 0.435450, 0.308078, 0.606648, 0.106333, 0.290413, 0.290216, 0.448965, 0.372166, 0.102493, 0.389413, 0.498634, 0.109129, 2.099355, 3.634276,
+				  0.115551, 0.641259, 0.046646, 0.260889, 0.587531, 0.093417, 0.280695, 0.307466, 6.227274, 0.206332, 0.459041, 0.033291, 0.559069, 18.392863, 0.411347, 0.101797, 0.034710,
+				  0.102453, 0.289466, 0.262076, 0.185083, 0.592318, 0.035149, 0.105999, 0.096556, 20.304886, 0.097050, 0.133091, 0.115301, 0.264728, 66.647302, 0.476350, 0.148995, 0.063603, 20.561407,
+				  0.916683, 0.102065, 0.043986, 0.080708, 0.885230, 0.072549, 0.206603, 0.306067, 0.205944, 5.381403, 0.561215, 0.112593, 0.693307, 0.400021, 0.584622, 0.089177, 0.755865, 0.133790, 0.154902
+				  },
+				  {
+				  0.066142,
+				  0.590377, 0.468325,
+				  0.069930, 0.013688, 2.851667,
+				  9.850951, 0.302287, 3.932151, 0.146882,
+				  1.101363, 1.353957, 8.159169, 0.249672, 0.582670,
+				  0.150375, 0.028386, 0.219934, 0.560142, 0.005035, 3.054085,
+				  0.568586, 0.037750, 0.421974, 0.046719, 0.275844, 0.129551, 0.037250,
+				  0.051668, 0.262130, 2.468752, 0.106259, 0.098208, 4.210126, 0.029788, 0.013513,
+				  0.127170, 0.016923, 0.344765, 0.003656, 0.445038, 0.165753, 0.008541, 0.002533, 0.031779,
+				  0.292429, 0.064289, 0.210724, 0.004200, 1.217010, 1.088704, 0.014768, 0.005848, 0.064558, 7.278994,
+				  0.071458, 0.855973, 1.172204, 0.014189, 0.033969, 1.889645, 0.125869, 0.031390, 0.065585, 0.029917, 0.042762,
+				  1.218562, 0.079621, 0.763553, 0.009876, 1.988516, 3.344809, 0.056702, 0.021612, 0.079927, 7.918203, 14.799537, 0.259400,
+				  0.075144, 0.011169, 0.082464, 0.002656, 0.681161, 0.111063, 0.004186, 0.004854, 0.095591, 0.450964, 1.506485, 0.009457, 1.375871,
+				  7.169085, 0.161937, 0.726566, 0.040244, 0.825960, 2.067758, 0.110993, 0.129497, 0.196886, 0.169797, 0.637893, 0.090576, 0.457399, 0.143327,
+				  30.139501, 0.276530, 11.149790, 0.267322, 18.762977, 3.547017, 0.201148, 0.976631, 0.408834, 0.104288, 0.123793, 0.292108, 0.598048, 0.328689, 3.478333,
+				  13.461692, 0.161053, 4.782635, 0.053740, 11.949233, 2.466507, 0.139705, 0.053397, 0.126088, 1.578530, 0.641351, 0.297913, 4.418398, 0.125011, 2.984862, 13.974326,
+				  0.021372, 0.081472, 0.058046, 0.006597, 0.286794, 0.188236, 0.009201, 0.019475, 0.037226, 0.015909, 0.154810, 0.017172, 0.239749, 0.562720, 0.061299, 0.154326, 0.060703,
+				  0.045779, 0.036742, 0.498072, 0.027639, 0.534219, 0.203493, 0.012095, 0.004964, 0.452302, 0.094365, 0.140750, 0.021976, 0.168432, 1.414883, 0.077470, 0.224675, 0.123480, 0.447011,
+				  4.270235, 0.030342, 0.258487, 0.012745, 4.336817, 0.281953, 0.043812, 0.015539, 0.016212, 16.179952, 3.416059, 0.032578, 2.950318, 0.227807, 1.050562, 0.112000, 5.294490, 0.033381, 0.045528
+				  },
+				  {
+				  0.733336,
+				  0.558955, 0.597671,
+				  0.503360, 0.058964, 5.581680,
+				  4.149599, 2.863355, 1.279881, 0.225860,
+				  1.415369, 2.872594, 1.335650, 0.434096, 1.043232,
+				  1.367574, 0.258365, 0.397108, 2.292917, 0.209978, 4.534772,
+				  1.263002, 0.366868, 1.840061, 1.024707, 0.823594, 0.377181, 0.496780,
+				  0.994098, 2.578946, 5.739035, 0.821921, 3.039380, 4.877840, 0.532488, 0.398817,
+				  0.517204, 0.358350, 0.284730, 0.027824, 1.463390, 0.370939, 0.232460, 0.008940, 0.349195,
+				  0.775054, 0.672023, 0.109781, 0.021443, 1.983693, 1.298542, 0.169219, 0.043707, 0.838324, 5.102837,
+				  0.763094, 5.349861, 1.612642, 0.088850, 0.397640, 3.509873, 0.755219, 0.436013, 0.888693, 0.561690, 0.401070,
+				  1.890137, 0.691594, 0.466979, 0.060820, 2.831098, 2.646440, 0.379926, 0.087640, 0.488389, 7.010411, 8.929538, 1.357738,
+				  0.540460, 0.063347, 0.141582, 0.018288, 4.102068, 0.087872, 0.020447, 0.064863, 1.385133, 3.054968, 5.525874, 0.043394, 3.135353,
+				  0.200122, 0.032875, 0.019509, 0.042687, 0.059723, 0.072299, 0.023282, 0.036426, 0.050226, 0.039318, 0.067505, 0.023126, 0.012695, 0.015631,
+				  4.972745, 0.821562, 4.670980, 1.199607, 5.901348, 1.139018, 0.503875, 1.673207, 0.962470, 0.204155, 0.273372, 0.567639, 0.570771, 0.458799, 0.233109,
+				  1.825593, 0.580847, 1.967383, 0.420710, 2.034980, 0.864479, 0.577513, 0.124068, 0.502294, 2.653232, 0.437116, 1.048288, 2.319555, 0.151684, 0.077004, 8.113282,
+				  0.450842, 0.661866, 0.088064, 0.037642, 2.600668, 0.390688, 0.109318, 0.218118, 1.065585, 0.564368, 1.927515, 0.120994, 1.856122, 4.154750, 0.011074, 0.377578, 0.222293,
+				  0.526135, 0.265730, 0.581928, 0.141233, 5.413080, 0.322761, 0.153776, 0.039217, 8.351808, 0.854294, 0.940458, 0.180650, 0.975427, 11.429924, 0.026268, 0.429221, 0.273138, 4.731579,
+				  3.839269, 0.395134, 0.145401, 0.090101, 4.193725, 0.625409, 0.696533, 0.104335, 0.377304, 15.559906, 2.508169, 0.449074, 3.404087, 1.457957, 0.052132, 0.260296, 2.903836, 0.564762, 0.681215
+				  },
+				  {
+				  0.658412,
+				  0.566269, 0.540749,
+				  0.854111, 0.058015, 3.060574,
+				  0.884454, 5.851132, 1.279257, 0.160296,
+				  1.309554, 2.294145, 1.438430, 0.482619, 0.992259,
+				  1.272639, 0.182966, 0.431464, 2.992763, 0.086318, 2.130054,
+				  1.874713, 0.684164, 2.075952, 1.296206, 2.149634, 0.571406, 0.507160,
+				  0.552007, 3.192521, 4.840271, 0.841829, 5.103188, 4.137385, 0.351381, 0.679853,
+				  0.227683, 0.528161, 0.644656, 0.031467, 3.775817, 0.437589, 0.189152, 0.025780, 0.665865,
+				  0.581512, 1.128882, 0.266076, 0.048542, 3.954021, 2.071689, 0.217780, 0.082005, 1.266791, 8.904999,
+				  0.695190, 3.010922, 2.084975, 0.132774, 0.190734, 2.498630, 0.767361, 0.326441, 0.680174, 0.652629, 0.440178,
+				  0.967985, 1.012866, 0.720060, 0.133055, 1.776095, 1.763546, 0.278392, 0.343977, 0.717301, 10.091413, 14.013035, 1.082703,
+				  0.344015, 0.227296, 0.291854, 0.056045, 4.495841, 0.116381, 0.092075, 0.195877, 4.001286, 2.671718, 5.069337, 0.091278, 4.643214,
+				  0.978992, 0.156635, 0.028961, 0.209188, 0.264277, 0.296578, 0.177263, 0.217424, 0.362942, 0.086367, 0.539010, 0.172734, 0.121821, 0.161015,
+				  3.427163, 0.878405, 4.071574, 0.925172, 7.063879, 1.033710, 0.451893, 3.057583, 1.189259, 0.359932, 0.742569, 0.693405, 0.584083, 1.531223, 1.287474,
+				  2.333253, 0.802754, 2.258357, 0.360522, 2.221150, 1.283423, 0.653836, 0.377558, 0.964545, 4.797423, 0.780580, 1.422571, 4.216178, 0.599244, 0.444362, 5.231362,
+				  0.154701, 0.830884, 0.073037, 0.094591, 3.017954, 0.312579, 0.074620, 0.401252, 1.350568, 0.336801, 1.331875, 0.068958, 1.677263, 5.832025, 0.076328, 0.548763, 0.208791,
+				  0.221089, 0.431617, 1.238426, 0.313945, 8.558815, 0.305772, 0.181992, 0.072258, 12.869737, 1.021885, 1.531589, 0.163829, 1.575754, 33.873091, 0.079916, 0.831890, 0.307846, 5.910440,
+				  2.088785, 0.456530, 0.199728, 0.118104, 4.310199, 0.681277, 0.752277, 0.241015, 0.531100, 23.029406, 4.414850, 0.481711, 5.046403, 1.914768, 0.466823, 0.382271, 3.717971, 0.282540, 0.964421
+				  }
+				  };
+			  double
+			  freqs[4][20] =
+				  {{0.147383 , 0.017579 , 0.058208 , 0.017707 , 0.026331 , 0.041582 , 0.017494 , 0.027859 , 0.011849 , 0.076971 ,
+				  0.147823 , 0.019535 , 0.037132 , 0.029940 , 0.008059 , 0.088179 , 0.089653 , 0.006477 , 0.032308 , 0.097931},
+				  {0.063139 , 0.066357 , 0.011586 , 0.066571 , 0.010800 , 0.009276 , 0.053984 , 0.146986 , 0.034214 , 0.088822 ,
+				  0.098196 , 0.032390 , 0.021263 , 0.072697 , 0.016761 , 0.020711 , 0.020797 , 0.025463 , 0.045615 , 0.094372},
+				  {0.062457 , 0.066826 , 0.049332 , 0.065270 , 0.006513 , 0.041231 , 0.058965 , 0.080852 , 0.028024 , 0.037024 ,
+				  0.075925 , 0.064131 , 0.019620 , 0.028710 , 0.104579 , 0.056388 , 0.062027 , 0.008241 , 0.033124 , 0.050760},
+				  {0.106471 , 0.074171 , 0.044513 , 0.096390 , 0.002148 , 0.066733 , 0.158908 , 0.037625 , 0.020691 , 0.014608 ,
+				  0.028797 , 0.105352 , 0.007864 , 0.007477 , 0.083595 , 0.055726 , 0.047711 , 0.003975 , 0.010088 , 0.027159}};
+			  int
+			  i,
+			  j,
+			  r = 0;
+			  for(i = 1; i < 20; i++)
+				  for(j = 0; j < i; j++)
+				  {
+					  daa[i * 20 + j] = rates[lg4_index][r];
+					  r++;
+				  }
+			  assert(r == 190);
+			  for(i = 0; i < 20; i++)
+				  f[i] = freqs[lg4_index][i];
+		  }
+		  break;
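For context, the assert(r == 190) in the LG4 block above reflects the size of the strict lower triangle of a symmetric 20x20 exchangeability matrix: 20*19/2 = 190 off-diagonal entries, visited row by row. A minimal, self-contained sketch (not PLL code) that reproduces the count and the traversal order:

    /* Sketch only: count the strict lower-triangle entries of a 20x20
       matrix in the same (row, column) order used to unpack rates[]. */
    #include <assert.h>

    int main(void)
    {
        int i, j, count = 0;

        for (i = 1; i < 20; i++)          /* rows below the diagonal      */
            for (j = 0; j < i; j++)       /* columns left of the diagonal */
                count++;                  /* entry (i, j) of the triangle */

        assert(count == 20 * 19 / 2);     /* 190, matching assert(r == 190) */
        return 0;
    }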
+        case PLL_MTART:
+          {
+           
+
+           daa[1*20+0]=   0.2;
+           daa[2*20+0]=   0.2;
+           daa[2*20+1]=   0.2;
+           daa[3*20+0]=   1;
+           daa[3*20+1]=   4;
+           daa[3*20+2]=   500;
+           daa[4*20+0]=   254;
+           daa[4*20+1]=   36;
+           daa[4*20+2]=   98;
+           daa[4*20+3]=   11;
+           daa[5*20+0]=   0.2;
+           daa[5*20+1]=   154;
+           daa[5*20+2]=   262;
+           daa[5*20+3]=   0.2;
+           daa[5*20+4]=   0.2;
+           daa[6*20+0]=   0.2;
+           daa[6*20+1]=   0.2;
+           daa[6*20+2]=   183;
+           daa[6*20+3]=   862;
+           daa[6*20+4]=   0.2;
+           daa[6*20+5]=   262;
+           daa[7*20+0]=   200;
+           daa[7*20+1]=   0.2;
+           daa[7*20+2]=   121;
+           daa[7*20+3]=   12;
+           daa[7*20+4]=   81;
+           daa[7*20+5]=   3;
+           daa[7*20+6]=   44;
+           daa[8*20+0]=   0.2;
+           daa[8*20+1]=   41;
+           daa[8*20+2]=   180;
+           daa[8*20+3]=   0.2;
+           daa[8*20+4]=   12;
+           daa[8*20+5]=   314;
+           daa[8*20+6]=   15;
+           daa[8*20+7]=   0.2;
+           daa[9*20+0]=   26;
+           daa[9*20+1]=   2;
+           daa[9*20+2]=   21;
+           daa[9*20+3]=   7;
+           daa[9*20+4]=   63;
+           daa[9*20+5]=   11;
+           daa[9*20+6]=   7;
+           daa[9*20+7]=   3;
+           daa[9*20+8]=   0.2;
+           daa[10*20+0]=  4;
+           daa[10*20+1]=  2;
+           daa[10*20+2]=  13;
+           daa[10*20+3]=  1;
+           daa[10*20+4]=  79;
+           daa[10*20+5]=  16;
+           daa[10*20+6]=  2;
+           daa[10*20+7]=  1;
+           daa[10*20+8]=  6;
+           daa[10*20+9]=  515;
+           daa[11*20+0]=  0.2;
+           daa[11*20+1]=  209;
+           daa[11*20+2]=  467;
+           daa[11*20+3]=  2;
+           daa[11*20+4]=  0.2;
+           daa[11*20+5]=  349;
+           daa[11*20+6]=  106;
+           daa[11*20+7]=  0.2;
+           daa[11*20+8]=  0.2;
+           daa[11*20+9]=  3;
+           daa[11*20+10]= 4;
+           daa[12*20+0]=  121;
+           daa[12*20+1]=  5;
+           daa[12*20+2]=  79;
+           daa[12*20+3]=  0.2;
+           daa[12*20+4]=  312;
+           daa[12*20+5]=  67;
+           daa[12*20+6]=  0.2;
+           daa[12*20+7]=  56;
+           daa[12*20+8]=  0.2;
+           daa[12*20+9]=  515;
+           daa[12*20+10]= 885;
+           daa[12*20+11]= 106;
+           daa[13*20+0]=  13;
+           daa[13*20+1]=  5;
+           daa[13*20+2]=  20;
+           daa[13*20+3]=  0.2;
+           daa[13*20+4]=  184;
+           daa[13*20+5]=  0.2;
+           daa[13*20+6]=  0.2;
+           daa[13*20+7]=  1;
+           daa[13*20+8]=  14;
+           daa[13*20+9]=  118;
+           daa[13*20+10]= 263;
+           daa[13*20+11]= 11;
+           daa[13*20+12]= 322;
+           daa[14*20+0]=  49;
+           daa[14*20+1]=  0.2;
+           daa[14*20+2]=  17;
+           daa[14*20+3]=  0.2;
+           daa[14*20+4]=  0.2;
+           daa[14*20+5]=  39;
+           daa[14*20+6]=  8;
+           daa[14*20+7]=  0.2;
+           daa[14*20+8]=  1;
+           daa[14*20+9]=  0.2;
+           daa[14*20+10]= 12;
+           daa[14*20+11]= 17;
+           daa[14*20+12]= 5;
+           daa[14*20+13]= 15;
+           daa[15*20+0]=  673;
+           daa[15*20+1]=  3;
+           daa[15*20+2]=  398;
+           daa[15*20+3]=  44;
+           daa[15*20+4]=  664;
+           daa[15*20+5]=  52;
+           daa[15*20+6]=  31;
+           daa[15*20+7]=  226;
+           daa[15*20+8]=  11;
+           daa[15*20+9]=  7;
+           daa[15*20+10]= 8;
+           daa[15*20+11]= 144;
+           daa[15*20+12]= 112;
+           daa[15*20+13]= 36;
+           daa[15*20+14]= 87;
+           daa[16*20+0]=  244;
+           daa[16*20+1]=  0.2;
+           daa[16*20+2]=  166;
+           daa[16*20+3]=  0.2;
+           daa[16*20+4]=  183;
+           daa[16*20+5]=  44;
+           daa[16*20+6]=  43;
+           daa[16*20+7]=  0.2;
+           daa[16*20+8]=  19;
+           daa[16*20+9]=  204;
+           daa[16*20+10]= 48;
+           daa[16*20+11]= 70;
+           daa[16*20+12]= 289;
+           daa[16*20+13]= 14;
+           daa[16*20+14]= 47;
+           daa[16*20+15]= 660;
+           daa[17*20+0]=  0.2;
+           daa[17*20+1]=  0.2;
+           daa[17*20+2]=  8;
+           daa[17*20+3]=  0.2;
+           daa[17*20+4]=  22;
+           daa[17*20+5]=  7;
+           daa[17*20+6]=  11;
+           daa[17*20+7]=  2;
+           daa[17*20+8]=  0.2;
+           daa[17*20+9]=  0.2;
+           daa[17*20+10]= 21;
+           daa[17*20+11]= 16;
+           daa[17*20+12]= 71;
+           daa[17*20+13]= 54;
+           daa[17*20+14]= 0.2;
+           daa[17*20+15]= 2;
+           daa[17*20+16]= 0.2;
+           daa[18*20+0]=  1;
+           daa[18*20+1]=  4;
+           daa[18*20+2]=  251;
+           daa[18*20+3]=  0.2;
+           daa[18*20+4]=  72;
+           daa[18*20+5]=  87;
+           daa[18*20+6]=  8;
+           daa[18*20+7]=  9;
+           daa[18*20+8]=  191;
+           daa[18*20+9]=  12;
+           daa[18*20+10]= 20;
+           daa[18*20+11]= 117;
+           daa[18*20+12]= 71;
+           daa[18*20+13]= 792;
+           daa[18*20+14]= 18;
+           daa[18*20+15]= 30;
+           daa[18*20+16]= 46;
+           daa[18*20+17]= 38;
+           daa[19*20+0]=  340;
+           daa[19*20+1]=  0.2;
+           daa[19*20+2]=  23;
+           daa[19*20+3]=  0.2;
+           daa[19*20+4]=  350;
+           daa[19*20+5]=  0.2;
+           daa[19*20+6]=  14;
+           daa[19*20+7]=  3;
+           daa[19*20+8]=  0.2;
+           daa[19*20+9]=  1855;
+           daa[19*20+10]= 85;
+           daa[19*20+11]= 26;
+           daa[19*20+12]= 281;
+           daa[19*20+13]= 52;
+           daa[19*20+14]= 32;
+           daa[19*20+15]= 61;
+           daa[19*20+16]= 544;
+           daa[19*20+17]= 0.2;
+           daa[19*20+18]= 2;
+           
+           f[0]=  0.054116;
+           f[1]=  0.018227;
+           f[2]=  0.039903;
+           f[3]=  0.020160;
+           f[4]=  0.009709;
+           f[5]=  0.018781;
+           f[6]=  0.024289;
+           f[7]=  0.068183;
+           f[8]=  0.024518;
+           f[9]=  0.092638;
+           f[10]= 0.148658;
+           f[11]= 0.021718;
+           f[12]= 0.061453;
+           f[13]= 0.088668;
+           f[14]= 0.041826;
+           f[15]= 0.091030;
+           f[16]= 0.049194;
+           f[17]= 0.029786;
+           f[18]= 0.039443;
+           f[19]= 0.057700;
+          }
+          break;
+        case PLL_MTZOA:
+          {
+           daa[1*20+0]=   3.3;
+           daa[2*20+0]=   1.7;
+           daa[2*20+1]=   33.6;
+           daa[3*20+0]=   16.1;
+           daa[3*20+1]=   3.2;
+           daa[3*20+2]=   617.0;
+           daa[4*20+0]=   272.5;
+           daa[4*20+1]=   61.1;
+           daa[4*20+2]=   94.6;
+           daa[4*20+3]=   9.5;
+           daa[5*20+0]=   7.3;
+           daa[5*20+1]=   231.0;
+           daa[5*20+2]=   190.3;
+           daa[5*20+3]=   19.3;
+           daa[5*20+4]=   49.1;
+           daa[6*20+0]=   17.1;
+           daa[6*20+1]=   6.4;
+           daa[6*20+2]=   174.0;
+           daa[6*20+3]=   883.6;
+           daa[6*20+4]=   3.4;
+           daa[6*20+5]=   349.4;
+           daa[7*20+0]=   289.3;
+           daa[7*20+1]=   7.2;
+           daa[7*20+2]=   99.3;
+           daa[7*20+3]=   26.0;
+           daa[7*20+4]=   82.4;
+           daa[7*20+5]=   8.9;
+           daa[7*20+6]=   43.1;
+           daa[8*20+0]=   2.3;
+           daa[8*20+1]=   61.7;
+           daa[8*20+2]=   228.9;
+           daa[8*20+3]=   55.6;
+           daa[8*20+4]=   37.5;
+           daa[8*20+5]=   421.8;
+           daa[8*20+6]=   14.9;
+           daa[8*20+7]=   7.4;
+           daa[9*20+0]=   33.2;
+           daa[9*20+1]=   0.2;
+           daa[9*20+2]=   24.3;
+           daa[9*20+3]=   1.5;
+           daa[9*20+4]=   48.8;
+           daa[9*20+5]=   0.2;
+           daa[9*20+6]=   7.3;
+           daa[9*20+7]=   3.4;
+           daa[9*20+8]=   1.6;
+           daa[10*20+0]=  15.6;
+           daa[10*20+1]=  4.1;
+           daa[10*20+2]=  7.9;
+           daa[10*20+3]=  0.5;
+           daa[10*20+4]=  59.7;
+           daa[10*20+5]=  23.0;
+           daa[10*20+6]=  1.0;
+           daa[10*20+7]=  3.5;
+           daa[10*20+8]=  6.6;
+           daa[10*20+9]=  425.2;
+           daa[11*20+0]=  0.2;
+           daa[11*20+1]=  292.3;
+           daa[11*20+2]=  413.4;
+           daa[11*20+3]=  0.2;
+           daa[11*20+4]=  0.2;
+           daa[11*20+5]=  334.0;
+           daa[11*20+6]=  163.2;
+           daa[11*20+7]=  10.1;
+           daa[11*20+8]=  23.9;
+           daa[11*20+9]=  8.4;
+           daa[11*20+10]= 6.7;
+           daa[12*20+0]=  136.5;
+           daa[12*20+1]=  3.8;
+           daa[12*20+2]=  73.7;
+           daa[12*20+3]=  0.2;
+           daa[12*20+4]=  264.8;
+           daa[12*20+5]=  83.9;
+           daa[12*20+6]=  0.2;
+           daa[12*20+7]=  52.2;
+           daa[12*20+8]=  7.1;
+           daa[12*20+9]=  449.7;
+           daa[12*20+10]= 636.3;
+           daa[12*20+11]= 83.0;
+           daa[13*20+0]=  26.5;
+           daa[13*20+1]=  0.2;
+           daa[13*20+2]=  12.9;
+           daa[13*20+3]=  2.0;
+           daa[13*20+4]=  167.8;
+           daa[13*20+5]=  9.5;
+           daa[13*20+6]=  0.2;
+           daa[13*20+7]=  5.8;
+           daa[13*20+8]=  13.1;
+           daa[13*20+9]=  90.3;
+           daa[13*20+10]= 234.2;
+           daa[13*20+11]= 16.3;
+           daa[13*20+12]= 215.6;
+           daa[14*20+0]=  61.8;
+           daa[14*20+1]=  7.5;
+           daa[14*20+2]=  22.6;
+           daa[14*20+3]=  0.2;
+           daa[14*20+4]=  8.1;
+           daa[14*20+5]=  52.2;
+           daa[14*20+6]=  20.6;
+           daa[14*20+7]=  1.3;
+           daa[14*20+8]=  15.6;
+           daa[14*20+9]=  2.6;
+           daa[14*20+10]= 11.4;
+           daa[14*20+11]= 24.3;
+           daa[14*20+12]= 5.4;
+           daa[14*20+13]= 10.5;
+           daa[15*20+0]=  644.9;
+           daa[15*20+1]=  11.8;
+           daa[15*20+2]=  420.2;
+           daa[15*20+3]=  51.4;
+           daa[15*20+4]=  656.3;
+           daa[15*20+5]=  96.4;
+           daa[15*20+6]=  38.4;
+           daa[15*20+7]=  257.1;
+           daa[15*20+8]=  23.1;
+           daa[15*20+9]=  7.2;
+           daa[15*20+10]= 15.2;
+           daa[15*20+11]= 144.9;
+           daa[15*20+12]= 95.3;
+           daa[15*20+13]= 32.2;
+           daa[15*20+14]= 79.7;
+           daa[16*20+0]=  378.1;
+           daa[16*20+1]=  3.2;
+           daa[16*20+2]=  184.6;
+           daa[16*20+3]=  2.3;
+           daa[16*20+4]=  199.0;
+           daa[16*20+5]=  39.4;
+           daa[16*20+6]=  34.5;
+           daa[16*20+7]=  5.2;
+           daa[16*20+8]=  19.4;
+           daa[16*20+9]=  222.3;
+           daa[16*20+10]= 50.0;
+           daa[16*20+11]= 75.5;
+           daa[16*20+12]= 305.1;
+           daa[16*20+13]= 19.3;
+           daa[16*20+14]= 56.9;
+           daa[16*20+15]= 666.3;
+           daa[17*20+0]=  3.1;
+           daa[17*20+1]=  16.9;
+           daa[17*20+2]=  6.4;
+           daa[17*20+3]=  0.2;
+           daa[17*20+4]=  36.1;
+           daa[17*20+5]=  6.1;
+           daa[17*20+6]=  3.5;
+           daa[17*20+7]=  12.3;
+           daa[17*20+8]=  4.5;
+           daa[17*20+9]=  9.7;
+           daa[17*20+10]= 27.2;
+           daa[17*20+11]= 6.6;
+           daa[17*20+12]= 48.7;
+           daa[17*20+13]= 58.2;
+           daa[17*20+14]= 1.3;
+           daa[17*20+15]= 10.3;
+           daa[17*20+16]= 3.6;
+           daa[18*20+0]=  2.1;
+           daa[18*20+1]=  13.8;
+           daa[18*20+2]=  141.6;
+           daa[18*20+3]=  13.9;
+           daa[18*20+4]=  76.7;
+           daa[18*20+5]=  52.3;
+           daa[18*20+6]=  10.0;
+           daa[18*20+7]=  4.3;
+           daa[18*20+8]=  266.5;
+           daa[18*20+9]=  13.1;
+           daa[18*20+10]= 5.7;
+           daa[18*20+11]= 45.0;
+           daa[18*20+12]= 41.4;
+           daa[18*20+13]= 590.5;
+           daa[18*20+14]= 4.2;
+           daa[18*20+15]= 29.7;
+           daa[18*20+16]= 29.0;
+           daa[18*20+17]= 79.8;
+           daa[19*20+0]=  321.9;
+           daa[19*20+1]=  5.1;
+           daa[19*20+2]=  7.1;
+           daa[19*20+3]=  3.7;
+           daa[19*20+4]=  243.8;
+           daa[19*20+5]=  9.0;
+           daa[19*20+6]=  16.3;
+           daa[19*20+7]=  23.7;
+           daa[19*20+8]=  0.3;
+           daa[19*20+9]=  1710.6;
+           daa[19*20+10]= 126.1;
+           daa[19*20+11]= 11.1;
+           daa[19*20+12]= 279.6;
+           daa[19*20+13]= 59.6;
+           daa[19*20+14]= 17.9;
+           daa[19*20+15]= 49.5;
+           daa[19*20+16]= 396.4;
+           daa[19*20+17]= 13.7;
+           daa[19*20+18]= 15.6;
+           
+           f[0]=  0.069;
+           f[1]=  0.021;
+           f[2]=  0.030;
+           f[3]=  0.020;
+           f[4]=  0.010;
+           f[5]=  0.019;
+           f[6]=  0.025;
+           f[7]=  0.072;
+           f[8]=  0.027;
+           f[9]=  0.085;
+           f[10]= 0.157;
+           f[11]= 0.019;
+           f[12]= 0.051;
+           f[13]= 0.082;
+           f[14]= 0.045;
+           f[15]= 0.081;
+           f[16]= 0.056;
+           f[17]= 0.028;
+           f[18]= 0.037;
+           f[19]= 0.066;
+          }
+          break;
+        case PLL_PMB:
+          {
+           daa[1*20+0]=   0.674995699;
+           daa[2*20+0]=   0.589645178;
+           daa[2*20+1]=   1.189067034;
+           daa[3*20+0]=   0.462499504;
+           daa[3*20+1]=   0.605460903;
+           daa[3*20+2]=   3.573373315;
+           daa[4*20+0]=   1.065445546;
+           daa[4*20+1]=   0.31444833;
+           daa[4*20+2]=   0.589852457;
+           daa[4*20+3]=   0.246951424;
+           daa[5*20+0]=   1.111766964;
+           daa[5*20+1]=   2.967840934;
+           daa[5*20+2]=   2.299755865;
+           daa[5*20+3]=   1.686058219;
+           daa[5*20+4]=   0.245163782;
+           daa[6*20+0]=   1.046334652;
+           daa[6*20+1]=   1.201770702;
+           daa[6*20+2]=   1.277836748;
+           daa[6*20+3]=   4.399995525;
+           daa[6*20+4]=   0.091071867;
+           daa[6*20+5]=   4.15967899;
+           daa[7*20+0]=   1.587964372;
+           daa[7*20+1]=   0.523770553;
+           daa[7*20+2]=   1.374854049;
+           daa[7*20+3]=   0.734992057;
+           daa[7*20+4]=   0.31706632;
+           daa[7*20+5]=   0.596789898;
+           daa[7*20+6]=   0.463812837;
+           daa[8*20+0]=   0.580830874;
+           daa[8*20+1]=   1.457127446;
+           daa[8*20+2]=   2.283037894;
+           daa[8*20+3]=   0.839348444;
+           daa[8*20+4]=   0.411543728;
+           daa[8*20+5]=   1.812173605;
+           daa[8*20+6]=   0.877842609;
+           daa[8*20+7]=   0.476331437;
+           daa[9*20+0]=   0.464590585;
+           daa[9*20+1]=   0.35964586;
+           daa[9*20+2]=   0.426069419;
+           daa[9*20+3]=   0.266775558;
+           daa[9*20+4]=   0.417547309;
+           daa[9*20+5]=   0.315256838;
+           daa[9*20+6]=   0.30421529;
+           daa[9*20+7]=   0.180198883;
+           daa[9*20+8]=   0.285186418;
+           daa[10*20+0]=  0.804404505;
+           daa[10*20+1]=  0.520701585;
+           daa[10*20+2]=  0.41009447;
+           daa[10*20+3]=  0.269124919;
+           daa[10*20+4]=  0.450795211;
+           daa[10*20+5]=  0.625792937;
+           daa[10*20+6]=  0.32078471;
+           daa[10*20+7]=  0.259854426;
+           daa[10*20+8]=  0.363981358;
+           daa[10*20+9]=  4.162454693;
+           daa[11*20+0]=  0.831998835;
+           daa[11*20+1]=  4.956476453;
+           daa[11*20+2]=  2.037575629;
+           daa[11*20+3]=  1.114178954;
+           daa[11*20+4]=  0.274163536;
+           daa[11*20+5]=  3.521346591;
+           daa[11*20+6]=  2.415974716;
+           daa[11*20+7]=  0.581001076;
+           daa[11*20+8]=  0.985885486;
+           daa[11*20+9]=  0.374784947;
+           daa[11*20+10]= 0.498011337;
+           daa[12*20+0]=  1.546725076;
+           daa[12*20+1]=  0.81346254;
+           daa[12*20+2]=  0.737846301;
+           daa[12*20+3]=  0.341932741;
+           daa[12*20+4]=  0.618614612;
+           daa[12*20+5]=  2.067388546;
+           daa[12*20+6]=  0.531773639;
+           daa[12*20+7]=  0.465349326;
+           daa[12*20+8]=  0.380925433;
+           daa[12*20+9]=  3.65807012;
+           daa[12*20+10]= 5.002338375;
+           daa[12*20+11]= 0.661095832;
+           daa[13*20+0]=  0.546169219;
+           daa[13*20+1]=  0.303437244;
+           daa[13*20+2]=  0.425193716;
+           daa[13*20+3]=  0.219005213;
+           daa[13*20+4]=  0.669206193;
+           daa[13*20+5]=  0.406042546;
+           daa[13*20+6]=  0.224154698;
+           daa[13*20+7]=  0.35402891;
+           daa[13*20+8]=  0.576231691;
+           daa[13*20+9]=  1.495264661;
+           daa[13*20+10]= 2.392638293;
+           daa[13*20+11]= 0.269496317;
+           daa[13*20+12]= 2.306919847;
+           daa[14*20+0]=  1.241586045;
+           daa[14*20+1]=  0.65577338;
+           daa[14*20+2]=  0.711495595;
+           daa[14*20+3]=  0.775624818;
+           daa[14*20+4]=  0.198679914;
+           daa[14*20+5]=  0.850116543;
+           daa[14*20+6]=  0.794584081;
+           daa[14*20+7]=  0.588254139;
+           daa[14*20+8]=  0.456058589;
+           daa[14*20+9]=  0.366232942;
+           daa[14*20+10]= 0.430073179;
+           daa[14*20+11]= 1.036079005;
+           daa[14*20+12]= 0.337502282;
+           daa[14*20+13]= 0.481144863;
+           daa[15*20+0]=  3.452308792;
+           daa[15*20+1]=  0.910144334;
+           daa[15*20+2]=  2.572577221;
+           daa[15*20+3]=  1.440896785;
+           daa[15*20+4]=  0.99870098;
+           daa[15*20+5]=  1.348272505;
+           daa[15*20+6]=  1.205509425;
+           daa[15*20+7]=  1.402122097;
+           daa[15*20+8]=  0.799966711;
+           daa[15*20+9]=  0.530641901;
+           daa[15*20+10]= 0.402471997;
+           daa[15*20+11]= 1.234648153;
+           daa[15*20+12]= 0.945453716;
+           daa[15*20+13]= 0.613230817;
+           daa[15*20+14]= 1.217683028;
+           daa[16*20+0]=  1.751412803;
+           daa[16*20+1]=  0.89517149;
+           daa[16*20+2]=  1.823161023;
+           daa[16*20+3]=  0.994227284;
+           daa[16*20+4]=  0.847312432;
+           daa[16*20+5]=  1.320626678;
+           daa[16*20+6]=  0.949599791;
+           daa[16*20+7]=  0.542185658;
+           daa[16*20+8]=  0.83039281;
+           daa[16*20+9]=  1.114132523;
+           daa[16*20+10]= 0.779827336;
+           daa[16*20+11]= 1.290709079;
+           daa[16*20+12]= 1.551488041;
+           daa[16*20+13]= 0.718895136;
+           daa[16*20+14]= 0.780913179;
+           daa[16*20+15]= 4.448982584;
+           daa[17*20+0]=  0.35011051;
+           daa[17*20+1]=  0.618778365;
+           daa[17*20+2]=  0.422407388;
+           daa[17*20+3]=  0.362495245;
+           daa[17*20+4]=  0.445669347;
+           daa[17*20+5]=  0.72038474;
+           daa[17*20+6]=  0.261258229;
+           daa[17*20+7]=  0.37874827;
+           daa[17*20+8]=  0.72436751;
+           daa[17*20+9]=  0.516260502;
+           daa[17*20+10]= 0.794797115;
+           daa[17*20+11]= 0.43340962;
+           daa[17*20+12]= 0.768395107;
+           daa[17*20+13]= 3.29519344;
+           daa[17*20+14]= 0.499869138;
+           daa[17*20+15]= 0.496334956;
+           daa[17*20+16]= 0.38372361;
+           daa[18*20+0]=  0.573154753;
+           daa[18*20+1]=  0.628599063;
+           daa[18*20+2]=  0.720013799;
+           daa[18*20+3]=  0.436220437;
+           daa[18*20+4]=  0.55626163;
+           daa[18*20+5]=  0.728970584;
+           daa[18*20+6]=  0.50720003;
+           daa[18*20+7]=  0.284727562;
+           daa[18*20+8]=  2.210952064;
+           daa[18*20+9]=  0.570562395;
+           daa[18*20+10]= 0.811019594;
+           daa[18*20+11]= 0.664884513;
+           daa[18*20+12]= 0.93253606;
+           daa[18*20+13]= 5.894735673;
+           daa[18*20+14]= 0.433748126;
+           daa[18*20+15]= 0.593795813;
+           daa[18*20+16]= 0.523549536;
+           daa[18*20+17]= 2.996248013;
+           daa[19*20+0]=  2.063050067;
+           daa[19*20+1]=  0.388680158;
+           daa[19*20+2]=  0.474418852;
+           daa[19*20+3]=  0.275658381;
+           daa[19*20+4]=  0.998911631;
+           daa[19*20+5]=  0.634408285;
+           daa[19*20+6]=  0.527640634;
+           daa[19*20+7]=  0.314700907;
+           daa[19*20+8]=  0.305792277;
+           daa[19*20+9]=  8.002789424;
+           daa[19*20+10]= 2.113077156;
+           daa[19*20+11]= 0.526184203;
+           daa[19*20+12]= 1.737356217;
+           daa[19*20+13]= 0.983844803;
+           daa[19*20+14]= 0.551333603;
+           daa[19*20+15]= 0.507506011;
+           daa[19*20+16]= 1.89965079;
+           daa[19*20+17]= 0.429570747;
+           daa[19*20+18]= 0.716795463;
+           
+           f[0]=  0.076;
+           f[1]=  0.054;
+           f[2]=  0.038;
+           f[3]=  0.045;
+           f[4]=  0.028;
+           f[5]=  0.034;
+           f[6]=  0.053;
+           f[7]=  0.078;
+           f[8]=  0.030;
+           f[9]=  0.060;
+           f[10]= 0.096;
+           f[11]= 0.052;
+           f[12]= 0.022;
+           f[13]= 0.045;
+           f[14]= 0.042;
+           f[15]= 0.068;
+           f[16]= 0.056;
+           f[17]= 0.016;
+           f[18]= 0.036;
+           f[19]= 0.071;
+          }
+          break;
+        case PLL_HIVB:
+          {
+           daa[1*20+0]=   0.30750700;
+           daa[2*20+0]=   0.00500000;
+           daa[2*20+1]=   0.29554300;
+           daa[3*20+0]=   1.45504000;
+           daa[3*20+1]=   0.00500000;
+           daa[3*20+2]=   17.66120000;
+           daa[4*20+0]=   0.12375800;
+           daa[4*20+1]=   0.35172100;
+           daa[4*20+2]=   0.08606420;
+           daa[4*20+3]=   0.00500000;
+           daa[5*20+0]=   0.05511280;
+           daa[5*20+1]=   3.42150000;
+           daa[5*20+2]=   0.67205200;
+           daa[5*20+3]=   0.00500000;
+           daa[5*20+4]=   0.00500000;
+           daa[6*20+0]=   1.48135000;
+           daa[6*20+1]=   0.07492180;
+           daa[6*20+2]=   0.07926330;
+           daa[6*20+3]=   10.58720000;
+           daa[6*20+4]=   0.00500000;
+           daa[6*20+5]=   2.56020000;
+           daa[7*20+0]=   2.13536000;
+           daa[7*20+1]=   3.65345000;
+           daa[7*20+2]=   0.32340100;
+           daa[7*20+3]=   2.83806000;
+           daa[7*20+4]=   0.89787100;
+           daa[7*20+5]=   0.06191370;
+           daa[7*20+6]=   3.92775000;
+           daa[8*20+0]=   0.08476130;
+           daa[8*20+1]=   9.04044000;
+           daa[8*20+2]=   7.64585000;
+           daa[8*20+3]=   1.91690000;
+           daa[8*20+4]=   0.24007300;
+           daa[8*20+5]=   7.05545000;
+           daa[8*20+6]=   0.11974000;
+           daa[8*20+7]=   0.00500000;
+           daa[9*20+0]=   0.00500000;
+           daa[9*20+1]=   0.67728900;
+           daa[9*20+2]=   0.68056500;
+           daa[9*20+3]=   0.01767920;
+           daa[9*20+4]=   0.00500000;
+           daa[9*20+5]=   0.00500000;
+           daa[9*20+6]=   0.00609079;
+           daa[9*20+7]=   0.00500000;
+           daa[9*20+8]=   0.10311100;
+           daa[10*20+0]=  0.21525600;
+           daa[10*20+1]=  0.70142700;
+           daa[10*20+2]=  0.00500000;
+           daa[10*20+3]=  0.00876048;
+           daa[10*20+4]=  0.12977700;
+           daa[10*20+5]=  1.49456000;
+           daa[10*20+6]=  0.00500000;
+           daa[10*20+7]=  0.00500000;
+           daa[10*20+8]=  1.74171000;
+           daa[10*20+9]=  5.95879000;
+           daa[11*20+0]=  0.00500000;
+           daa[11*20+1]=  20.45000000;
+           daa[11*20+2]=  7.90443000;
+           daa[11*20+3]=  0.00500000;
+           daa[11*20+4]=  0.00500000;
+           daa[11*20+5]=  6.54737000;
+           daa[11*20+6]=  4.61482000;
+           daa[11*20+7]=  0.52170500;
+           daa[11*20+8]=  0.00500000;
+           daa[11*20+9]=  0.32231900;
+           daa[11*20+10]= 0.08149950;
+           daa[12*20+0]=  0.01866430;
+           daa[12*20+1]=  2.51394000;
+           daa[12*20+2]=  0.00500000;
+           daa[12*20+3]=  0.00500000;
+           daa[12*20+4]=  0.00500000;
+           daa[12*20+5]=  0.30367600;
+           daa[12*20+6]=  0.17578900;
+           daa[12*20+7]=  0.00500000;
+           daa[12*20+8]=  0.00500000;
+           daa[12*20+9]=  11.20650000;
+           daa[12*20+10]= 5.31961000;
+           daa[12*20+11]= 1.28246000;
+           daa[13*20+0]=  0.01412690;
+           daa[13*20+1]=  0.00500000;
+           daa[13*20+2]=  0.00500000;
+           daa[13*20+3]=  0.00500000;
+           daa[13*20+4]=  9.29815000;
+           daa[13*20+5]=  0.00500000;
+           daa[13*20+6]=  0.00500000;
+           daa[13*20+7]=  0.29156100;
+           daa[13*20+8]=  0.14555800;
+           daa[13*20+9]=  3.39836000;
+           daa[13*20+10]= 8.52484000;
+           daa[13*20+11]= 0.03426580;
+           daa[13*20+12]= 0.18802500;
+           daa[14*20+0]=  2.12217000;
+           daa[14*20+1]=  1.28355000;
+           daa[14*20+2]=  0.00739578;
+           daa[14*20+3]=  0.03426580;
+           daa[14*20+4]=  0.00500000;
+           daa[14*20+5]=  4.47211000;
+           daa[14*20+6]=  0.01202260;
+           daa[14*20+7]=  0.00500000;
+           daa[14*20+8]=  2.45318000;
+           daa[14*20+9]=  0.04105930;
+           daa[14*20+10]= 2.07757000;
+           daa[14*20+11]= 0.03138620;
+           daa[14*20+12]= 0.00500000;
+           daa[14*20+13]= 0.00500000;
+           daa[15*20+0]=  2.46633000;
+           daa[15*20+1]=  3.47910000;
+           daa[15*20+2]=  13.14470000;
+           daa[15*20+3]=  0.52823000;
+           daa[15*20+4]=  4.69314000;
+           daa[15*20+5]=  0.11631100;
+           daa[15*20+6]=  0.00500000;
+           daa[15*20+7]=  4.38041000;
+           daa[15*20+8]=  0.38274700;
+           daa[15*20+9]=  1.21803000;
+           daa[15*20+10]= 0.92765600;
+           daa[15*20+11]= 0.50411100;
+           daa[15*20+12]= 0.00500000;
+           daa[15*20+13]= 0.95647200;
+           daa[15*20+14]= 5.37762000;
+           daa[16*20+0]=  15.91830000;
+           daa[16*20+1]=  2.86868000;
+           daa[16*20+2]=  6.88667000;
+           daa[16*20+3]=  0.27472400;
+           daa[16*20+4]=  0.73996900;
+           daa[16*20+5]=  0.24358900;
+           daa[16*20+6]=  0.28977400;
+           daa[16*20+7]=  0.36961500;
+           daa[16*20+8]=  0.71159400;
+           daa[16*20+9]=  8.61217000;
+           daa[16*20+10]= 0.04376730;
+           daa[16*20+11]= 4.67142000;
+           daa[16*20+12]= 4.94026000;
+           daa[16*20+13]= 0.01412690;
+           daa[16*20+14]= 2.01417000;
+           daa[16*20+15]= 8.93107000;
+           daa[17*20+0]=  0.00500000;
+           daa[17*20+1]=  0.99133800;
+           daa[17*20+2]=  0.00500000;
+           daa[17*20+3]=  0.00500000;
+           daa[17*20+4]=  2.63277000;
+           daa[17*20+5]=  0.02665600;
+           daa[17*20+6]=  0.00500000;
+           daa[17*20+7]=  1.21674000;
+           daa[17*20+8]=  0.06951790;
+           daa[17*20+9]=  0.00500000;
+           daa[17*20+10]= 0.74884300;
+           daa[17*20+11]= 0.00500000;
+           daa[17*20+12]= 0.08907800;
+           daa[17*20+13]= 0.82934300;
+           daa[17*20+14]= 0.04445060;
+           daa[17*20+15]= 0.02487280;
+           daa[17*20+16]= 0.00500000;
+           daa[18*20+0]=  0.00500000;
+           daa[18*20+1]=  0.00991826;
+           daa[18*20+2]=  1.76417000;
+           daa[18*20+3]=  0.67465300;
+           daa[18*20+4]=  7.57932000;
+           daa[18*20+5]=  0.11303300;
+           daa[18*20+6]=  0.07926330;
+           daa[18*20+7]=  0.00500000;
+           daa[18*20+8]=  18.69430000;
+           daa[18*20+9]=  0.14816800;
+           daa[18*20+10]= 0.11198600;
+           daa[18*20+11]= 0.00500000;
+           daa[18*20+12]= 0.00500000;
+           daa[18*20+13]= 15.34000000;
+           daa[18*20+14]= 0.03043810;
+           daa[18*20+15]= 0.64802400;
+           daa[18*20+16]= 0.10565200;
+           daa[18*20+17]= 1.28022000;
+           daa[19*20+0]=  7.61428000;
+           daa[19*20+1]=  0.08124540;
+           daa[19*20+2]=  0.02665600;
+           daa[19*20+3]=  1.04793000;
+           daa[19*20+4]=  0.42002700;
+           daa[19*20+5]=  0.02091530;
+           daa[19*20+6]=  1.02847000;
+           daa[19*20+7]=  0.95315500;
+           daa[19*20+8]=  0.00500000;
+           daa[19*20+9]=  17.73890000;
+           daa[19*20+10]= 1.41036000;
+           daa[19*20+11]= 0.26582900;
+           daa[19*20+12]= 6.85320000;
+           daa[19*20+13]= 0.72327400;
+           daa[19*20+14]= 0.00500000;
+           daa[19*20+15]= 0.07492180;
+           daa[19*20+16]= 0.70922600;
+           daa[19*20+17]= 0.00500000;
+           daa[19*20+18]= 0.04105930;
+           
+	   f[0]= 0.060490222;           f[1]= 0.066039665;           f[2]= 0.044127815;           f[3]= 0.042109048;
+           f[4]= 0.020075899;           f[5]= 0.053606488;           f[6]= 0.071567447;           f[7]= 0.072308239;
+           f[8]= 0.022293943;           f[9]= 0.069730629;           f[10]= 0.098851122;          f[11]= 0.056968211;
+           f[12]= 0.019768318;          f[13]= 0.028809447;          f[14]= 0.046025282;          f[15]= 0.05060433;
+           f[16]= 0.053636813;          f[17]= 0.033011601;          f[18]= 0.028350243;          f[19]= 0.061625237;
+          }
+          break;
+        case PLL_HIVW:
+          {
+           daa[1*20+0]=   0.0744808;
+           daa[2*20+0]=   0.6175090;
+           daa[2*20+1]=   0.1602400;
+           daa[3*20+0]=   4.4352100;
+           daa[3*20+1]=   0.0674539;
+           daa[3*20+2]=   29.4087000;
+           daa[4*20+0]=   0.1676530;
+           daa[4*20+1]=   2.8636400;
+           daa[4*20+2]=   0.0604932;
+           daa[4*20+3]=   0.0050000;
+           daa[5*20+0]=   0.0050000;
+           daa[5*20+1]=   10.6746000;
+           daa[5*20+2]=   0.3420680;
+           daa[5*20+3]=   0.0050000;
+           daa[5*20+4]=   0.0050000;
+           daa[6*20+0]=   5.5632500;
+           daa[6*20+1]=   0.0251632;
+           daa[6*20+2]=   0.2015260;
+           daa[6*20+3]=   12.1233000;
+           daa[6*20+4]=   0.0050000;
+           daa[6*20+5]=   3.2065600;
+           daa[7*20+0]=   1.8685000;
+           daa[7*20+1]=   13.4379000;
+           daa[7*20+2]=   0.0604932;
+           daa[7*20+3]=   10.3969000;
+           daa[7*20+4]=   0.0489798;
+           daa[7*20+5]=   0.0604932;
+           daa[7*20+6]=   14.7801000;
+           daa[8*20+0]=   0.0050000;
+           daa[8*20+1]=   6.8440500;
+           daa[8*20+2]=   8.5987600;
+           daa[8*20+3]=   2.3177900;
+           daa[8*20+4]=   0.0050000;
+           daa[8*20+5]=   18.5465000;
+           daa[8*20+6]=   0.0050000;
+           daa[8*20+7]=   0.0050000;
+           daa[9*20+0]=   0.0050000;
+           daa[9*20+1]=   1.3406900;
+           daa[9*20+2]=   0.9870280;
+           daa[9*20+3]=   0.1451240;
+           daa[9*20+4]=   0.0050000;
+           daa[9*20+5]=   0.0342252;
+           daa[9*20+6]=   0.0390512;
+           daa[9*20+7]=   0.0050000;
+           daa[9*20+8]=   0.0050000;
+           daa[10*20+0]=  0.1602400;
+           daa[10*20+1]=  0.5867570;
+           daa[10*20+2]=  0.0050000;
+           daa[10*20+3]=  0.0050000;
+           daa[10*20+4]=  0.0050000;
+           daa[10*20+5]=  2.8904800;
+           daa[10*20+6]=  0.1298390;
+           daa[10*20+7]=  0.0489798;
+           daa[10*20+8]=  1.7638200;
+           daa[10*20+9]=  9.1024600;
+           daa[11*20+0]=  0.5927840;
+           daa[11*20+1]=  39.8897000;
+           daa[11*20+2]=  10.6655000;
+           daa[11*20+3]=  0.8943130;
+           daa[11*20+4]=  0.0050000;
+           daa[11*20+5]=  13.0705000;
+           daa[11*20+6]=  23.9626000;
+           daa[11*20+7]=  0.2794250;
+           daa[11*20+8]=  0.2240600;
+           daa[11*20+9]=  0.8174810;
+           daa[11*20+10]= 0.0050000;
+           daa[12*20+0]=  0.0050000;
+           daa[12*20+1]=  3.2865200;
+           daa[12*20+2]=  0.2015260;
+           daa[12*20+3]=  0.0050000;
+           daa[12*20+4]=  0.0050000;
+           daa[12*20+5]=  0.0050000;
+           daa[12*20+6]=  0.0050000;
+           daa[12*20+7]=  0.0489798;
+           daa[12*20+8]=  0.0050000;
+           daa[12*20+9]=  17.3064000;
+           daa[12*20+10]= 11.3839000;
+           daa[12*20+11]= 4.0956400;
+           daa[13*20+0]=  0.5979230;
+           daa[13*20+1]=  0.0050000;
+           daa[13*20+2]=  0.0050000;
+           daa[13*20+3]=  0.0050000;
+           daa[13*20+4]=  0.3629590;
+           daa[13*20+5]=  0.0050000;
+           daa[13*20+6]=  0.0050000;
+           daa[13*20+7]=  0.0050000;
+           daa[13*20+8]=  0.0050000;
+           daa[13*20+9]=  1.4828800;
+           daa[13*20+10]= 7.4878100;
+           daa[13*20+11]= 0.0050000;
+           daa[13*20+12]= 0.0050000;
+           daa[14*20+0]=  1.0098100;
+           daa[14*20+1]=  0.4047230;
+           daa[14*20+2]=  0.3448480;
+           daa[14*20+3]=  0.0050000;
+           daa[14*20+4]=  0.0050000;
+           daa[14*20+5]=  3.0450200;
+           daa[14*20+6]=  0.0050000;
+           daa[14*20+7]=  0.0050000;
+           daa[14*20+8]=  13.9444000;
+           daa[14*20+9]=  0.0050000;
+           daa[14*20+10]= 9.8309500;
+           daa[14*20+11]= 0.1119280;
+           daa[14*20+12]= 0.0050000;
+           daa[14*20+13]= 0.0342252;
+           daa[15*20+0]=  8.5942000;
+           daa[15*20+1]=  8.3502400;
+           daa[15*20+2]=  14.5699000;
+           daa[15*20+3]=  0.4278810;
+           daa[15*20+4]=  1.1219500;
+           daa[15*20+5]=  0.1602400;
+           daa[15*20+6]=  0.0050000;
+           daa[15*20+7]=  6.2796600;
+           daa[15*20+8]=  0.7251570;
+           daa[15*20+9]=  0.7400910;
+           daa[15*20+10]= 6.1439600;
+           daa[15*20+11]= 0.0050000;
+           daa[15*20+12]= 0.3925750;
+           daa[15*20+13]= 4.2793900;
+           daa[15*20+14]= 14.2490000;
+           daa[16*20+0]=  24.1422000;
+           daa[16*20+1]=  0.9282030;
+           daa[16*20+2]=  4.5420600;
+           daa[16*20+3]=  0.6303950;
+           daa[16*20+4]=  0.0050000;
+           daa[16*20+5]=  0.2030910;
+           daa[16*20+6]=  0.4587430;
+           daa[16*20+7]=  0.0489798;
+           daa[16*20+8]=  0.9595600;
+           daa[16*20+9]=  9.3634500;
+           daa[16*20+10]= 0.0050000;
+           daa[16*20+11]= 4.0480200;
+           daa[16*20+12]= 7.4131300;
+           daa[16*20+13]= 0.1145120;
+           daa[16*20+14]= 4.3370100;
+           daa[16*20+15]= 6.3407900;
+           daa[17*20+0]=  0.0050000;
+           daa[17*20+1]=  5.9656400;
+           daa[17*20+2]=  0.0050000;
+           daa[17*20+3]=  0.0050000;
+           daa[17*20+4]=  5.4989400;
+           daa[17*20+5]=  0.0443298;
+           daa[17*20+6]=  0.0050000;
+           daa[17*20+7]=  2.8258000;
+           daa[17*20+8]=  0.0050000;
+           daa[17*20+9]=  0.0050000;
+           daa[17*20+10]= 1.3703100;
+           daa[17*20+11]= 0.0050000;
+           daa[17*20+12]= 0.0050000;
+           daa[17*20+13]= 0.0050000;
+           daa[17*20+14]= 0.0050000;
+           daa[17*20+15]= 1.1015600;
+           daa[17*20+16]= 0.0050000;
+           daa[18*20+0]=  0.0050000;
+           daa[18*20+1]=  0.0050000;
+           daa[18*20+2]=  5.0647500;
+           daa[18*20+3]=  2.2815400;
+           daa[18*20+4]=  8.3483500;
+           daa[18*20+5]=  0.0050000;
+           daa[18*20+6]=  0.0050000;
+           daa[18*20+7]=  0.0050000;
+           daa[18*20+8]=  47.4889000;
+           daa[18*20+9]=  0.1145120;
+           daa[18*20+10]= 0.0050000;
+           daa[18*20+11]= 0.0050000;
+           daa[18*20+12]= 0.5791980;
+           daa[18*20+13]= 4.1272800;
+           daa[18*20+14]= 0.0050000;
+           daa[18*20+15]= 0.9331420;
+           daa[18*20+16]= 0.4906080;
+           daa[18*20+17]= 0.0050000;
+           daa[19*20+0]=  24.8094000;
+           daa[19*20+1]=  0.2794250;
+           daa[19*20+2]=  0.0744808;
+           daa[19*20+3]=  2.9178600;
+           daa[19*20+4]=  0.0050000;
+           daa[19*20+5]=  0.0050000;
+           daa[19*20+6]=  2.1995200;
+           daa[19*20+7]=  2.7962200;
+           daa[19*20+8]=  0.8274790;
+           daa[19*20+9]=  24.8231000;
+           daa[19*20+10]= 2.9534400;
+           daa[19*20+11]= 0.1280650;
+           daa[19*20+12]= 14.7683000;
+           daa[19*20+13]= 2.2800000;
+           daa[19*20+14]= 0.0050000;
+           daa[19*20+15]= 0.8626370;
+           daa[19*20+16]= 0.0050000;
+           daa[19*20+17]= 0.0050000;
+           daa[19*20+18]= 1.3548200;
+           
+	   f[0]= 0.0377494;             f[1]= 0.057321;              f[2]= 0.0891129;             f[3]= 0.0342034;
+           f[4]= 0.0240105;             f[5]= 0.0437824;             f[6]= 0.0618606;             f[7]= 0.0838496;
+           f[8]= 0.0156076;             f[9]= 0.0983641;             f[10]= 0.0577867;            f[11]= 0.0641682;
+           f[12]= 0.0158419;            f[13]= 0.0422741;            f[14]= 0.0458601;            f[15]= 0.0550846;
+           f[16]= 0.0813774;            f[17]= 0.019597;             f[18]= 0.0205847;            f[19]= 0.0515638;
+          }
+          break;
+        case PLL_JTTDCMUT:
+          {
+           daa[1*20+0]=   0.531678;
+           daa[2*20+0]=   0.557967;
+           daa[2*20+1]=   0.451095;
+           daa[3*20+0]=   0.827445;
+           daa[3*20+1]=   0.154899;
+           daa[3*20+2]=   5.549530;
+           daa[4*20+0]=   0.574478;
+           daa[4*20+1]=   1.019843;
+           daa[4*20+2]=   0.313311;
+           daa[4*20+3]=   0.105625;
+           daa[5*20+0]=   0.556725;
+           daa[5*20+1]=   3.021995;
+           daa[5*20+2]=   0.768834;
+           daa[5*20+3]=   0.521646;
+           daa[5*20+4]=   0.091304;
+           daa[6*20+0]=   1.066681;
+           daa[6*20+1]=   0.318483;
+           daa[6*20+2]=   0.578115;
+           daa[6*20+3]=   7.766557;
+           daa[6*20+4]=   0.053907;
+           daa[6*20+5]=   3.417706;
+           daa[7*20+0]=   1.740159;
+           daa[7*20+1]=   1.359652;
+           daa[7*20+2]=   0.773313;
+           daa[7*20+3]=   1.272434;
+           daa[7*20+4]=   0.546389;
+           daa[7*20+5]=   0.231294;
+           daa[7*20+6]=   1.115632;
+           daa[8*20+0]=   0.219970;
+           daa[8*20+1]=   3.210671;
+           daa[8*20+2]=   4.025778;
+           daa[8*20+3]=   1.032342;
+           daa[8*20+4]=   0.724998;
+           daa[8*20+5]=   5.684080;
+           daa[8*20+6]=   0.243768;
+           daa[8*20+7]=   0.201696;
+           daa[9*20+0]=   0.361684;
+           daa[9*20+1]=   0.239195;
+           daa[9*20+2]=   0.491003;
+           daa[9*20+3]=   0.115968;
+           daa[9*20+4]=   0.150559;
+           daa[9*20+5]=   0.078270;
+           daa[9*20+6]=   0.111773;
+           daa[9*20+7]=   0.053769;
+           daa[9*20+8]=   0.181788;
+           daa[10*20+0]=  0.310007;
+           daa[10*20+1]=  0.372261;
+           daa[10*20+2]=  0.137289;
+           daa[10*20+3]=  0.061486;
+           daa[10*20+4]=  0.164593;
+           daa[10*20+5]=  0.709004;
+           daa[10*20+6]=  0.097485;
+           daa[10*20+7]=  0.069492;
+           daa[10*20+8]=  0.540571;
+           daa[10*20+9]=  2.335139;
+           daa[11*20+0]=  0.369437;
+           daa[11*20+1]=  6.529255;
+           daa[11*20+2]=  2.529517;
+           daa[11*20+3]=  0.282466;
+           daa[11*20+4]=  0.049009;
+           daa[11*20+5]=  2.966732;
+           daa[11*20+6]=  1.731684;
+           daa[11*20+7]=  0.269840;
+           daa[11*20+8]=  0.525096;
+           daa[11*20+9]=  0.202562;
+           daa[11*20+10]= 0.146481;
+           daa[12*20+0]=  0.469395;
+           daa[12*20+1]=  0.431045;
+           daa[12*20+2]=  0.330720;
+           daa[12*20+3]=  0.190001;
+           daa[12*20+4]=  0.409202;
+           daa[12*20+5]=  0.456901;
+           daa[12*20+6]=  0.175084;
+           daa[12*20+7]=  0.130379;
+           daa[12*20+8]=  0.329660;
+           daa[12*20+9]=  4.831666;
+           daa[12*20+10]= 3.856906;
+           daa[12*20+11]= 0.624581;
+           daa[13*20+0]=  0.138293;
+           daa[13*20+1]=  0.065314;
+           daa[13*20+2]=  0.073481;
+           daa[13*20+3]=  0.032522;
+           daa[13*20+4]=  0.678335;
+           daa[13*20+5]=  0.045683;
+           daa[13*20+6]=  0.043829;
+           daa[13*20+7]=  0.050212;
+           daa[13*20+8]=  0.453428;
+           daa[13*20+9]=  0.777090;
+           daa[13*20+10]= 2.500294;
+           daa[13*20+11]= 0.024521;
+           daa[13*20+12]= 0.436181;
+           daa[14*20+0]=  1.959599;
+           daa[14*20+1]=  0.710489;
+           daa[14*20+2]=  0.121804;
+           daa[14*20+3]=  0.127164;
+           daa[14*20+4]=  0.123653;
+           daa[14*20+5]=  1.608126;
+           daa[14*20+6]=  0.191994;
+           daa[14*20+7]=  0.208081;
+           daa[14*20+8]=  1.141961;
+           daa[14*20+9]=  0.098580;
+           daa[14*20+10]= 1.060504;
+           daa[14*20+11]= 0.216345;
+           daa[14*20+12]= 0.164215;
+           daa[14*20+13]= 0.148483;
+           daa[15*20+0]=  3.887095;
+           daa[15*20+1]=  1.001551;
+           daa[15*20+2]=  5.057964;
+           daa[15*20+3]=  0.589268;
+           daa[15*20+4]=  2.155331;
+           daa[15*20+5]=  0.548807;
+           daa[15*20+6]=  0.312449;
+           daa[15*20+7]=  1.874296;
+           daa[15*20+8]=  0.743458;
+           daa[15*20+9]=  0.405119;
+           daa[15*20+10]= 0.592511;
+           daa[15*20+11]= 0.474478;
+           daa[15*20+12]= 0.285564;
+           daa[15*20+13]= 0.943971;
+           daa[15*20+14]= 2.788406;
+           daa[16*20+0]=  4.582565;
+           daa[16*20+1]=  0.650282;
+           daa[16*20+2]=  2.351311;
+           daa[16*20+3]=  0.425159;
+           daa[16*20+4]=  0.469823;
+           daa[16*20+5]=  0.523825;
+           daa[16*20+6]=  0.331584;
+           daa[16*20+7]=  0.316862;
+           daa[16*20+8]=  0.477355;
+           daa[16*20+9]=  2.553806;
+           daa[16*20+10]= 0.272514;
+           daa[16*20+11]= 0.965641;
+           daa[16*20+12]= 2.114728;
+           daa[16*20+13]= 0.138904;
+           daa[16*20+14]= 1.176961;
+           daa[16*20+15]= 4.777647;
+           daa[17*20+0]=  0.084329;
+           daa[17*20+1]=  1.257961;
+           daa[17*20+2]=  0.027700;
+           daa[17*20+3]=  0.057466;
+           daa[17*20+4]=  1.104181;
+           daa[17*20+5]=  0.172206;
+           daa[17*20+6]=  0.114381;
+           daa[17*20+7]=  0.544180;
+           daa[17*20+8]=  0.128193;
+           daa[17*20+9]=  0.134510;
+           daa[17*20+10]= 0.530324;
+           daa[17*20+11]= 0.089134;
+           daa[17*20+12]= 0.201334;
+           daa[17*20+13]= 0.537922;
+           daa[17*20+14]= 0.069965;
+           daa[17*20+15]= 0.310927;
+           daa[17*20+16]= 0.080556;
+           daa[18*20+0]=  0.139492;
+           daa[18*20+1]=  0.235601;
+           daa[18*20+2]=  0.700693;
+           daa[18*20+3]=  0.453952;
+           daa[18*20+4]=  2.114852;
+           daa[18*20+5]=  0.254745;
+           daa[18*20+6]=  0.063452;
+           daa[18*20+7]=  0.052500;
+           daa[18*20+8]=  5.848400;
+           daa[18*20+9]=  0.303445;
+           daa[18*20+10]= 0.241094;
+           daa[18*20+11]= 0.087904;
+           daa[18*20+12]= 0.189870;
+           daa[18*20+13]= 5.484236;
+           daa[18*20+14]= 0.113850;
+           daa[18*20+15]= 0.628608;
+           daa[18*20+16]= 0.201094;
+           daa[18*20+17]= 0.747889;
+           daa[19*20+0]=  2.924161;
+           daa[19*20+1]=  0.171995;
+           daa[19*20+2]=  0.164525;
+           daa[19*20+3]=  0.315261;
+           daa[19*20+4]=  0.621323;
+           daa[19*20+5]=  0.179771;
+           daa[19*20+6]=  0.465271;
+           daa[19*20+7]=  0.470140;
+           daa[19*20+8]=  0.121827;
+           daa[19*20+9]=  9.533943;
+           daa[19*20+10]= 1.761439;
+           daa[19*20+11]= 0.124066;
+           daa[19*20+12]= 3.038533;
+           daa[19*20+13]= 0.593478;
+           daa[19*20+14]= 0.211561;
+           daa[19*20+15]= 0.408532;
+           daa[19*20+16]= 1.143980;
+           daa[19*20+17]= 0.239697;
+           daa[19*20+18]= 0.165473;
+           
+           f[0]=  0.077;
+           f[1]=  0.051;
+           f[2]=  0.043;
+           f[3]=  0.051;
+           f[4]=  0.020;
+           f[5]=  0.041;
+           f[6]=  0.062;
+           f[7]=  0.075;
+           f[8]=  0.023;
+           f[9]=  0.053;
+           f[10]= 0.091;
+           f[11]= 0.059;
+           f[12]= 0.024;
+           f[13]= 0.040;
+           f[14]= 0.051;
+           f[15]= 0.068;
+           f[16]= 0.059;
+           f[17]= 0.014;
+           f[18]= 0.032;
+           f[19]= 0.066;
+          }
+          break;
+        case PLL_FLU:
+          {
+            daa[ 1*20+ 0]       =       0.138658765     ;
+            daa[ 2*20+ 0]       =       0.053366579     ;
+            daa[ 2*20+ 1]       =       0.161000889     ;
+            daa[ 3*20+ 0]       =       0.584852306     ;
+            daa[ 3*20+ 1]       =       0.006771843     ;
+            daa[ 3*20+ 2]       =       7.737392871     ;
+            daa[ 4*20+ 0]       =       0.026447095     ;
+            daa[ 4*20+ 1]       =       0.167207008     ;
+            daa[ 4*20+ 2]       =       1.30E-05        ;
+            daa[ 4*20+ 3]       =       1.41E-02        ;
+            daa[ 5*20+ 0]       =       0.353753982     ;
+            daa[ 5*20+ 1]       =       3.292716942     ;
+            daa[ 5*20+ 2]       =       0.530642655     ;
+            daa[ 5*20+ 3]       =       0.145469388     ;
+            daa[ 5*20+ 4]       =       0.002547334     ;
+            daa[ 6*20+ 0]       =       1.484234503     ;
+            daa[ 6*20+ 1]       =       0.124897617     ;
+            daa[ 6*20+ 2]       =       0.061652192     ;
+            daa[ 6*20+ 3]       =       5.370511279     ;
+            daa[ 6*20+ 4]       =       3.91E-11        ;
+            daa[ 6*20+ 5]       =       1.195629122     ;
+            daa[ 7*20+ 0]       =       1.132313122     ;
+            daa[ 7*20+ 1]       =       1.190624465     ;
+            daa[ 7*20+ 2]       =       0.322524648     ;
+            daa[ 7*20+ 3]       =       1.934832784     ;
+            daa[ 7*20+ 4]       =       0.116941459     ;
+            daa[ 7*20+ 5]       =       0.108051341     ;
+            daa[ 7*20+ 6]       =       1.593098825     ;
+            daa[ 8*20+ 0]       =       0.214757862     ;
+            daa[ 8*20+ 1]       =       1.879569938     ;
+            daa[ 8*20+ 2]       =       1.387096032     ;
+            daa[ 8*20+ 3]       =       0.887570549     ;
+            daa[ 8*20+ 4]       =       2.18E-02        ;
+            daa[ 8*20+ 5]       =       5.330313412     ;
+            daa[ 8*20+ 6]       =       0.256491863     ;
+            daa[ 8*20+ 7]       =       0.058774527     ;
+            daa[ 9*20+ 0]       =       0.149926734     ;
+            daa[ 9*20+ 1]       =       0.246117172     ;
+            daa[ 9*20+ 2]       =       0.218571975     ;
+            daa[ 9*20+ 3]       =       0.014085917     ;
+            daa[ 9*20+ 4]       =       0.001112158     ;
+            daa[ 9*20+ 5]       =       0.02883995      ;
+            daa[ 9*20+ 6]       =       1.42E-02        ;
+            daa[ 9*20+ 7]       =       1.63E-05        ;
+            daa[ 9*20+ 8]       =       0.243190142     ;
+            daa[10*20+ 0]       =       0.023116952     ;
+            daa[10*20+ 1]       =       0.296045557     ;
+            daa[10*20+ 2]       =       8.36E-04        ;
+            daa[10*20+ 3]       =       0.005730682     ;
+            daa[10*20+ 4]       =       0.005613627     ;
+            daa[10*20+ 5]       =       1.020366955     ;
+            daa[10*20+ 6]       =       0.016499536     ;
+            daa[10*20+ 7]       =       0.006516229     ;
+            daa[10*20+ 8]       =       0.321611694     ;
+            daa[10*20+ 9]       =       3.512072282     ;
+            daa[11*20+ 0]       =       0.47433361      ;
+            daa[11*20+ 1]       =       15.30009662     ;
+            daa[11*20+ 2]       =       2.646847965     ;
+            daa[11*20+ 3]       =       0.29004298      ;
+            daa[11*20+ 4]       =       3.83E-06        ;
+            daa[11*20+ 5]       =       2.559587177     ;
+            daa[11*20+ 6]       =       3.881488809     ;
+            daa[11*20+ 7]       =       0.264148929     ;
+            daa[11*20+ 8]       =       0.347302791     ;
+            daa[11*20+ 9]       =       0.227707997     ;
+            daa[11*20+10]       =       0.129223639     ;
+            daa[12*20+ 0]       =       0.058745423     ;
+            daa[12*20+ 1]       =       0.890162346     ;
+            daa[12*20+ 2]       =       0.005251688     ;
+            daa[12*20+ 3]       =       0.041762964     ;
+            daa[12*20+ 4]       =       0.11145731      ;
+            daa[12*20+ 5]       =       0.190259181     ;
+            daa[12*20+ 6]       =       0.313974351     ;
+            daa[12*20+ 7]       =       0.001500467     ;
+            daa[12*20+ 8]       =       0.001273509     ;
+            daa[12*20+ 9]       =       9.017954203     ;
+            daa[12*20+10]       =       6.746936485     ;
+            daa[12*20+11]       =       1.331291619     ;
+            daa[13*20+ 0]       =       0.080490909     ;
+            daa[13*20+ 1]       =       1.61E-02        ;
+            daa[13*20+ 2]       =       8.36E-04        ;
+            daa[13*20+ 3]       =       1.06E-06        ;
+            daa[13*20+ 4]       =       0.104053666     ;
+            daa[13*20+ 5]       =       0.032680657     ;
+            daa[13*20+ 6]       =       0.001003501     ;
+            daa[13*20+ 7]       =       0.001236645     ;
+            daa[13*20+ 8]       =       0.119028506     ;
+            daa[13*20+ 9]       =       1.463357278     ;
+            daa[13*20+10]       =       2.986800036     ;
+            daa[13*20+11]       =       3.20E-01        ;
+            daa[13*20+12]       =       0.279910509     ;
+            daa[14*20+ 0]       =       0.659311478     ;
+            daa[14*20+ 1]       =       0.15402718      ;
+            daa[14*20+ 2]       =       3.64E-02        ;
+            daa[14*20+ 3]       =       0.188539456     ;
+            daa[14*20+ 4]       =       1.59E-13        ;
+            daa[14*20+ 5]       =       0.712769599     ;
+            daa[14*20+ 6]       =       0.319558828     ;
+            daa[14*20+ 7]       =       0.038631761     ;
+            daa[14*20+ 8]       =       0.924466914     ;
+            daa[14*20+ 9]       =       0.080543327     ;
+            daa[14*20+10]       =       0.634308521     ;
+            daa[14*20+11]       =       0.195750632     ;
+            daa[14*20+12]       =       5.69E-02        ;
+            daa[14*20+13]       =       0.00713243      ;
+            daa[15*20+ 0]       =       3.011344519     ;
+            daa[15*20+ 1]       =       0.95013841      ;
+            daa[15*20+ 2]       =       3.881310531     ;
+            daa[15*20+ 3]       =       0.338372183     ;
+            daa[15*20+ 4]       =       0.336263345     ;
+            daa[15*20+ 5]       =       0.487822499     ;
+            daa[15*20+ 6]       =       0.307140298     ;
+            daa[15*20+ 7]       =       1.585646577     ;
+            daa[15*20+ 8]       =       0.58070425      ;
+            daa[15*20+ 9]       =       0.290381075     ;
+            daa[15*20+10]       =       0.570766693     ;
+            daa[15*20+11]       =       0.283807672     ;
+            daa[15*20+12]       =       0.007026588     ;
+            daa[15*20+13]       =       0.99668567      ;
+            daa[15*20+14]       =       2.087385344     ;
+            daa[16*20+ 0]       =       5.418298175     ;
+            daa[16*20+ 1]       =       0.183076905     ;
+            daa[16*20+ 2]       =       2.140332316     ;
+            daa[16*20+ 3]       =       0.135481233     ;
+            daa[16*20+ 4]       =       0.011975266     ;
+            daa[16*20+ 5]       =       0.602340963     ;
+            daa[16*20+ 6]       =       0.280124895     ;
+            daa[16*20+ 7]       =       0.01880803      ;
+            daa[16*20+ 8]       =       0.368713573     ;
+            daa[16*20+ 9]       =       2.904052286     ;
+            daa[16*20+10]       =       0.044926357     ;
+            daa[16*20+11]       =       1.5269642       ;
+            daa[16*20+12]       =       2.031511321     ;
+            daa[16*20+13]       =       0.000134906     ;
+            daa[16*20+14]       =       0.542251094     ;
+            daa[16*20+15]       =       2.206859934     ;
+            daa[17*20+ 0]       =       1.96E-01        ;
+            daa[17*20+ 1]       =       1.369429408     ;
+            daa[17*20+ 2]       =       5.36E-04        ;
+            daa[17*20+ 3]       =       1.49E-05        ;
+            daa[17*20+ 4]       =       0.09410668      ;
+            daa[17*20+ 5]       =       4.40E-02        ;
+            daa[17*20+ 6]       =       0.155245492     ;
+            daa[17*20+ 7]       =       0.196486447     ;
+            daa[17*20+ 8]       =       2.24E-02        ;
+            daa[17*20+ 9]       =       0.03213215      ;
+            daa[17*20+10]       =       0.431277663     ;
+            daa[17*20+11]       =       4.98E-05        ;
+            daa[17*20+12]       =       0.070460039     ;
+            daa[17*20+13]       =       0.814753094     ;
+            daa[17*20+14]       =       0.000431021     ;
+            daa[17*20+15]       =       0.099835753     ;
+            daa[17*20+16]       =       0.207066206     ;
+            daa[18*20+ 0]       =       0.018289288     ;
+            daa[18*20+ 1]       =       0.099855497     ;
+            daa[18*20+ 2]       =       0.373101927     ;
+            daa[18*20+ 3]       =       0.525398543     ;
+            daa[18*20+ 4]       =       0.601692431     ;
+            daa[18*20+ 5]       =       0.072205935     ;
+            daa[18*20+ 6]       =       0.10409287      ;
+            daa[18*20+ 7]       =       0.074814997     ;
+            daa[18*20+ 8]       =       6.448954446     ;
+            daa[18*20+ 9]       =       0.273934263     ;
+            daa[18*20+10]       =       0.340058468     ;
+            daa[18*20+11]       =       0.012416222     ;
+            daa[18*20+12]       =       0.874272175     ;
+            daa[18*20+13]       =       5.393924245     ;
+            daa[18*20+14]       =       1.82E-04        ;
+            daa[18*20+15]       =       0.39255224      ;
+            daa[18*20+16]       =       0.12489802      ;
+            daa[18*20+17]       =       0.42775543      ;
+            daa[19*20+ 0]       =       3.53200527      ;
+            daa[19*20+ 1]       =       0.103964386     ;
+            daa[19*20+ 2]       =       0.010257517     ;
+            daa[19*20+ 3]       =       0.297123975     ;
+            daa[19*20+ 4]       =       0.054904564     ;
+            daa[19*20+ 5]       =       0.406697814     ;
+            daa[19*20+ 6]       =       0.285047948     ;
+            daa[19*20+ 7]       =       0.337229619     ;
+            daa[19*20+ 8]       =       0.098631355     ;
+            daa[19*20+ 9]       =       14.39405219     ;
+            daa[19*20+10]       =       0.890598579     ;
+            daa[19*20+11]       =       0.07312793      ;
+            daa[19*20+12]       =       4.904842235     ;
+            daa[19*20+13]       =       0.592587985     ;
+            daa[19*20+14]       =       0.058971975     ;
+            daa[19*20+15]       =       0.088256423     ;
+            daa[19*20+16]       =       0.654109108     ;
+            daa[19*20+17]       =       0.256900461     ;
+            daa[19*20+18]       =       0.167581647     ;
+            
+ 
+  
+            f[0]        =       0.0471  ;
+            f[1]        =       0.0509  ;
+            f[2]        =       0.0742  ;
+            f[3]        =       0.0479  ;
+            f[4]        =       0.0250  ;
+            f[5]        =       0.0333  ;
+            f[6]        =       0.0546  ;
+            f[7]        =       0.0764  ;
+            f[8]        =       0.0200  ;
+            f[9]        =       0.0671  ;
+            f[10]       =       0.0715  ;
+            f[11]       =       0.0568  ;
+            f[12]       =       0.0181  ;
+            f[13]       =       0.0305  ;
+            f[14]       =       0.0507  ;
+            f[15]       =       0.0884  ;
+            f[16]       =       0.0743  ;
+            f[17]       =       0.0185  ;
+            f[18]       =       0.0315  ;
+            f[19]       =       0.0632  ;
+          }
+          break;     
+        default: 
+          assert(0);
+        }
+    }
+
+
+  /*
+    
+  TODO: review frequency sums for fixed as well as empirical base frequencies!
+  
+  NUMERICAL BUG fix: rounded the AA frequencies in some models such that 
+  they actually sum to 1.0 +/- epsilon 
+  
+  {
+    double acc = 0.0;
+  
+    for(i = 0; i < 20; i++)
+      acc += f[i];
+    
+    printf("%1.80f\n", acc);
+    assert(acc == 1.0);  
+  }
+  */
+ 
+
+
+  /* fill the upper triangle (above the diagonal) with the corresponding values
+     from the lower triangle */
+  for (i=0; i<20; i++)  
+    for (j=0; j<i; j++)               
+      daa[j*20+i] = daa[i*20+j];
+
+  
+  /*
+    for (i=0; i<20; i++)  
+    {
+    for (j=0; j<20; j++)
+    {
+    if(i == j)
+    printf("0.0 ");
+    else
+    printf("%f ", daa[i * 20 + j]);
+    }
+    printf("\n");
+    }
+    
+    for (i=0; i<20; i++) 
+    printf("%f ", f[i]);
+    printf("\n");
+  */
+  
+
+  max = 0;
+  
+  /* copy the triangle above the diagonal from daa (which is a linear block) to
+     the triangle above the diagonal of a square matrix q. Store the maximal
+     value in variable max */
+  for(i = 0; i < 19; i++)
+    for(j = i + 1; j < 20; j++)
+      {
+        q[i][j] = temp = daa[i * 20 + j];
+        if(temp > max) 
+          max = temp;
+      }
+ 
+  scaler = PLL_AA_SCALE / max;
+   
+  /* SCALING HAS BEEN RE-INTRODUCED TO RESOLVE NUMERICAL  PROBLEMS */   
+
+  /* copy and scale values to the initialRates array */
+  r = 0;
+  for(i = 0; i < 19; i++)
+    {      
+      for(j = i + 1; j < 20; j++)
+        {  
+        
+          q[i][j] *= scaler;
+          
+          
+          assert(q[i][j] <= PLL_AA_SCALE_PLUS_EPSILON);
+          
+          initialRates[r++] = q[i][j];
+        }
+    }             
+}
+
+/** @brief Set the per-partition contributions and the overall fracchange
+  *
+  * Update \a partitionContribution in each partition by setting it to the fraction of sites
+  * that the partition contributes to the total number of sites. Also set \a tr->fracchange to
+  * the weighted average of the \a fracchange values computed for the individual partitions.
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  * 
+  * @todo 
+      I understand how fracchange is computed for each partition, but I don't know
+      what it is used for. Also, what is tr->fracchange for?
+*/
+static void updateFracChange(pllInstance *tr, partitionList *pr)
+{   
+  int numberOfModels = pr->numberOfPartitions;
+  if(numberOfModels == 1)
+    {   
+      assert(pr->partitionData[0]->fracchange != -1.0);
+     
+      tr->fracchange = pr->partitionData[0]->fracchange;
+      pr->partitionData[0]->fracchange = -1.0;
+      pr->partitionData[0]->rawFracchange = pr->partitionData[0]->fracchange;
+    }      
+  else
+    {
+      int model;
+      double *modelWeights = (double *)rax_calloc((size_t)numberOfModels, sizeof(double));
+      double wgtsum = 0.0;  
+     
+      assert(numberOfModels > 1);
+
+      tr->fracchange = 0.0;              
+      
+       for(model = 0; model < numberOfModels; model++)
+         {
+           size_t
+             lower = pr->partitionData[model]->lower,
+             upper = pr->partitionData[model]->upper,
+             i;
+           
+           for(i = lower; i < upper; i++)
+             {
+               modelWeights[model] += (double)tr->aliaswgt[i];
+               wgtsum              += (double)tr->aliaswgt[i];
+             }
+         }
+
+       /*for(i = 0; i < tr->originalCrunchedLength; i++)
+        {
+          modelWeights[tr->model[i]]  += (double)tr->aliaswgt[i];
+          wgtsum                      += (double)tr->aliaswgt[i];
+          }*/  
+
+      
+                
+      for(model = 0; model < numberOfModels; model++)
+        {                        
+          pr->partitionData[model]->partitionContribution = modelWeights[model] / wgtsum;
+          tr->fracchange +=  pr->partitionData[model]->partitionContribution * pr->partitionData[model]->fracchange;
+          pr->partitionData[model]->rawFracchange = pr->partitionData[model]->fracchange;
+        }
+    
+      rax_free(modelWeights);
+    }
+
+    tr->rawFracchange = tr->fracchange;
+}
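+
+/* Illustration with hypothetical numbers: for two partitions whose site weights sum to 300 and 700
+   and whose per-partition fracchange values are 1.2 and 0.8, the partitionContribution values become
+   0.3 and 0.7, and tr->fracchange = 0.3 * 1.2 + 0.7 * 0.8 = 0.92. */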
+
+/** @brief Householder reduction of a real symmetric matrix to tridiagonal form
+  * 
+  * Adaptation of the classic \a tred2 routine: reduce the symmetric \a n x \a n matrix \a a
+  * to tridiagonal form, storing the diagonal in \a d and the off-diagonal elements in \a e,
+  * while \a a is overwritten with the orthogonal transformation that is accumulated for the
+  * subsequent eigenvector computation in mytqli().
+  */
+static void mytred2(double **a, const int n, double *d, double *e)
+{
+  int     l, k, j, i;
+  double  scale, hh, h, g, f; 
+ 
+  for (i = n; i > 1; i--)
+    {
+      l = i - 1;
+      h = 0.0;
+      scale = 0.0;
+      
+      if (l > 1)
+        {
+          for (k = 1; k <= l; k++)
+            scale += fabs(a[k - 1][i - 1]);
+          if (scale == 0.0)
+            e[i - 1] = a[l - 1][i - 1];
+          else
+            {
+              for (k = 1; k <= l; k++)
+                {
+                  a[k - 1][i - 1] /= scale;
+                  h += a[k - 1][i - 1] * a[k - 1][i - 1];
+                }
+              f = a[l - 1][i - 1];
+              g = ((f > 0) ? -sqrt(h) : sqrt(h)); /* diff */
+              e[i - 1] = scale * g;
+              h -= f * g;
+              a[l - 1][i - 1] = f - g;
+              f = 0.0;
+              for (j = 1; j <= l; j++)
+                {
+                  a[i - 1][j - 1] = a[j - 1][i - 1] / h;
+                  g = 0.0;
+                  for (k = 1; k <= j; k++)
+                    g += a[k - 1][j - 1] * a[k - 1][i - 1];
+                  for (k = j + 1; k <= l; k++)
+                    g += a[j - 1][k - 1] * a[k - 1][i - 1];
+                  e[j - 1] = g / h;
+                  f += e[j - 1] * a[j - 1][i - 1];
+                }
+              hh = f / (h + h);
+              for (j = 1; j <= l; j++)
+                {
+                  f = a[j - 1][i - 1];
+                  g = e[j - 1] - hh * f;
+                  e[j - 1] = g;
+                  for (k = 1; k <= j; k++)
+                    a[k - 1][j - 1] -= (f * e[k - 1] + g * a[k - 1][i - 1]);
+                }
+            }
+        } 
+      else
+        e[i - 1] = a[l - 1][i - 1];
+      d[i - 1] = h;
+    }
+  d[0] = 0.0;
+  e[0] = 0.0;
+  
+  for (i = 1; i <= n; i++)
+    {
+      l = i - 1;
+      if (d[i - 1] != 0.0)
+        {
+          for (j = 1; j <= l; j++)
+            {
+                g = 0.0;
+                for (k = 1; k <= l; k++)
+                  g += a[k - 1][i - 1] * a[j - 1][k - 1];
+                for(k = 1; k <= l; k++)
+                  a[j - 1][k - 1] -= g * a[i - 1][k - 1];
+            }
+        }
+      d[i - 1] = a[i - 1][i - 1];
+      a[i - 1][i - 1] = 1.0;
+      for (j = 1; j <= l; j++)
+        a[i - 1][j - 1] = a[j - 1][i - 1] = 0.0;
+    }
+ 
+ 
+}
+/*#define MYSIGN(a,b) ((b)<0 ? -fabs(a) : fabs(a))*/
+
+/** @brief Eigenvalues and eigenvectors of a symmetric tridiagonal matrix
+  * 
+  * Adaptation of the classic \a tqli routine (QL algorithm with implicit shifts): given the
+  * diagonal \a d and off-diagonal \a e produced by mytred2(), compute the eigenvalues (returned
+  * in \a d) and accumulate the corresponding eigenvectors in \a z.
+  */
+static int mytqli(double *d, double *e, const int n, double **z)
+{
+  int     m, l, iter, i, k;
+  double  s, r, p, g, f, dd, c, b;
+   
+  for (i = 2; i <= n; i++)
+    e[i - 2] = e[i - 1];
+
+  e[n - 1] = 0.0;
+
+  for (l = 1; l <= n; l++)
+    {
+      iter = 0;
+      do
+        {
+          for (m = l; m <= n - 1; m++)
+            {
+              dd = fabs(d[m - 1]) + fabs(d[m]);
+              if (fabs(e[m - 1]) + dd == dd)
+                break;
+            }
+
+          if (m != l)
+           {
+             assert(iter < 30);
+             
+             g = (d[l] - d[l - 1]) / (2.0 * e[l - 1]);
+             r = sqrt((g * g) + 1.0);
+             g = d[m - 1] - d[l - 1] + e[l - 1] / (g + ((g < 0)?-fabs(r):fabs(r)));/*MYSIGN(r, g));*/
+             s = c = 1.0;
+             p = 0.0;
+
+             for (i = m - 1; i >= l; i--)
+               {
+                 f = s * e[i - 1];
+                 b = c * e[i - 1];
+                 if (fabs(f) >= fabs(g))
+                   {
+                     c = g / f;
+                     r = sqrt((c * c) + 1.0);
+                     e[i] = f * r;
+                     c *= (s = 1.0 / r);
+                   } 
+                 else
+                   {
+                     s = f / g;
+                     r = sqrt((s * s) + 1.0);
+                     e[i] = g * r;
+                     s *= (c = 1.0 / r);
+                   }
+                 g = d[i] - p;
+                 r = (d[i - 1] - g) * s + 2.0 * c * b;
+                 p = s * r;
+                 d[i] = g + p;
+                 g = c * r - b;
+                 for (k = 1; k <= n; k++)
+                   {
+                     f = z[i][k-1];
+                     z[i][k-1] = s * z[i - 1][k - 1] + c * f;
+                     z[i - 1][k - 1] = c * z[i - 1][k - 1] - s * f;
+                   }
+               }
+
+             d[l - 1] = d[l - 1] - p;
+             e[l - 1] = g;
+             e[m - 1] = 0.0;
+           }
+        } 
+      while (m != l);
+    }
+
+    
+ 
+    return (1);
+ }
+
+
+/** @brief Compute the eigenvectors and eigenvalues
+  *
+  * @param _a
+  *   The Q matrix
+  *
+  * @param states
+  *   Number of states
+  *
+  * @param d
+  *  Array where the computed eigenvalues are stored
+  * 
+  * @param e
+  *  Workspace for the off-diagonal elements: it is filled by mytred2(), consumed by mytqli() and not used afterwards in initGeneric()
+  *
+  * @todo
+  *   Remove \a e from the parameter list and allocate it locally?
+*/
+static void makeEigen(double **_a, const int states, double *d, double *e)
+{
+  mytred2(_a, states, d, e);
+  mytqli(d, e, states, _a);
+}
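+
+/* Together, mytred2() and mytqli() form a symmetric eigensolver. For example, for the 2x2
+   symmetric matrix {{0, 1}, {1, 0}} the eigenvalues are +1 and -1 with eigenvectors
+   (1, 1)/sqrt(2) and (1, -1)/sqrt(2); makeEigen() returns the eigenvalues in d and the
+   eigenvectors in _a (up to ordering, sign and row/column conventions). */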
+
+/** @brief Generic initialization of parameters and decomposition of the Q matrix
+  *
+  * Decompose the Q matrix into eigenvectors and eigenvalues. 
+  *
+  * @param states
+  *  Number of states of the current model
+  *
+  * @param valueVector
+  *  Pointer where the tipVector will be stored
+  *
+  * @param valueVectorLength
+  *  Number of elements (of size \a states) of the tipVector
+  *
+  * @param fracchange
+  *  Variable where the computed fracchange will be stored
+  *
+  * @param ext_EIGN
+  *   Array where the eigenvalues will be stored
+  *
+  * @param EV
+  *   Array where the eigenvectors will be stored
+  *  
+  * @param EI
+  *   Array where the inverse eigenvectors will be stored
+  *
+  * @param frequencies
+  *   The model frequencies
+  *
+  * @param ext_initialRates
+  *   The model substitution rates
+  *
+  * @param tipVector
+  *   Array where the computed tipVector will be stored
+  *
+  * @todo
+  *   Perhaps we could change this also to the way pllOptRatesGeneric and other functions are implemented.
+  *   That is, instead of passing all these parameters, pass the partition index instead and load the
+  *   values within the code. Will make the code more readable. 
+*/
+static void initGeneric(const int states, 
+                        const unsigned int *valueVector, 
+                        int valueVectorLength,
+                        double *fracchange,
+                        double *ext_EIGN,
+                        double *EV,
+                        double *EI,
+                        double *frequencies,
+                        double *ext_initialRates,
+                        double *tipVector
+                      )
+{
+  double 
+    **r, 
+    **a, 
+    **EIGV,
+    *initialRates = ext_initialRates, 
+    *f, 
+    *e, 
+    *d, 
+    *invfreq, 
+    *EIGN,
+    *eptr; 
+  
+  int 
+    i, 
+    j, 
+    k, 
+    m, 
+    l;  
+
+  r    = (double **)rax_malloc((size_t)states * sizeof(double *));
+  EIGV = (double **)rax_malloc((size_t)states * sizeof(double *));  
+  a    = (double **)rax_malloc((size_t)states * sizeof(double *));        
+  
+  for(i = 0; i < states; i++)
+    {
+      a[i]    = (double*)rax_malloc((size_t)states * sizeof(double));
+      EIGV[i] = (double*)rax_malloc((size_t)states * sizeof(double));
+      r[i]    = (double*)rax_malloc((size_t)states * sizeof(double));
+    }
+
+  f       = (double*)rax_malloc((size_t)states * sizeof(double));
+  e       = (double*)rax_malloc((size_t)states * sizeof(double));
+  d       = (double*)rax_malloc((size_t)states * sizeof(double));
+  invfreq = (double*)rax_malloc((size_t)states * sizeof(double));
+  EIGN    = (double*)rax_malloc((size_t)states * sizeof(double));
+  
+  for(l = 0; l < states; l++) 
+    f[l] = frequencies[l];      
+    
+  
+  i = 0;
+  
+  for(j = 0; j < states; j++)    
+    for(k = 0; k < states; k++)
+      r[j][k] = 0.0;
+  
+  for(j = 0; j < states - 1; j++)
+    for (k = j + 1; k < states; k++)              
+      r[j][k] = initialRates[i++];         
+  
+  for (j = 0; j < states; j++) 
+    {
+      r[j][j] = 0.0;
+      for (k = 0; k < j; k++)
+        r[j][k] = r[k][j];
+    }                         
+  
+  
+
+  *fracchange = 0.0;
+  
+  for (j = 0; j < states; j++)
+    for (k = 0; k < states; k++)
+      *fracchange += f[j] * r[j][k] * f[k];
+  
+  m = 0;
+  
+  for(i=0; i< states; i++) 
+    a[i][i] = 0;
+  
+  /*  assert(r[states - 2][states - 1] == 1.0);*/
+  
+  /* compute a matrix from the rates such that each element of the diagonal
+     equals to the negative sum of all other values in the current row */
+  for(i = 0; i < states; i++) 
+    {
+      for(j = i + 1;  j < states; j++) 
+        {
+          double factor =  initialRates[m++];
+          a[i][j] = a[j][i] = factor * sqrt( f[i] * f[j]);
+          a[i][i] -= factor * f[j];
+          a[j][j] -= factor * f[i];
+        }
+    }                           
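+
+  /* Note on the construction above: for a reversible model with exchangeabilities r_{ij} and
+     frequencies f_i, the rate matrix Q has off-diagonal entries Q_{ij} = r_{ij} * f_j.  The
+     matrix a built here, with a_{ij} = r_{ij} * sqrt(f_i * f_j) and a_{ii} = -sum_{j != i} r_{ij} * f_j,
+     equals D^{1/2} * Q * D^{-1/2} with D = diag(f).  It is therefore symmetric and has the same
+     (real) eigenvalues as Q, which is why the symmetric eigensolver below can be used. */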
+
+  makeEigen(a, states, d, e);
+  
+ 
+  
+  for (i = 0; i < states; i++)     
+    for (j = 0; j < states; j++)       
+      a[i][j] *= sqrt(f[j]);
+   
+  
+  
+  for (i = 0; i < states; i++)
+    {     
+      if (d[i] > -1e-8) 
+        {             
+          if (i != 0) 
+            {               
+              double tmp = d[i], sum=0;
+              d[i] = d[0];
+              d[0] = tmp;
+              for (j=0; j < states; j++) 
+                {
+                  tmp = a[i][j];
+                  a[i][j] = a[0][j];
+                  sum += (a[0][j] = tmp);
+                }
+              for (j=0; j < states; j++) 
+                a[0][j] /= sum;
+            }
+          break;
+        }
+    }
+  
+  for (i = 0; i < states; i++) 
+    {
+      EIGN[i] = -d[i];
+      
+      for (j=0; j < states; j++)
+        EIGV[i][j] = a[j][i];
+      invfreq[i] = 1 / EIGV[i][0]; 
+    }                                    
+  
+  ext_EIGN[0] = 0.0;
+
+  for (l = 1; l < states; l++)
+    {
+      ext_EIGN[l] = EIGN[l]; 
+      assert(ext_EIGN[l] > 0.0);
+    }
+  
+  eptr = EV;
+  
+  for (i = 0; i < states; i++)            
+    for (j = 0; j < states; j++)
+      {
+        *eptr++ = EIGV[i][j];    /* EIGV: eigenvectors */ 
+        
+      }
+  for (i = 0; i < states; i++)
+    for (j = 0; j < states; j++)
+      {
+        if(j == 0)
+          EI[i * states + j] = 1.0;
+        else
+          EI[i * states + j] = EV[i * states + j] * invfreq[i];   /* EV = eigenvector matrix, EI = inverse eigenvector matrix, u_{i,x}^{-1} = pi_x * u_{x,i} */
+      }
+  
+  for (i = 0; i < valueVectorLength; i++)
+    {
+      unsigned int value = valueVector[i];
+      
+      for(j = 0; j < states; j++)
+        tipVector[i * states + j]     = 0;                  
+
+      if(value > 0)
+        {                     
+          for (j = 0; j < states; j++) 
+            {       
+              if ((value >> j) & 1) 
+                {
+                  int l;
+                  for (l = 0; l < states; l++)
+                    tipVector[i * states + l] += EIGV[j][l];
+                }                         
+            }       
+        }     
+    }
+
+  for (i = 0; i < valueVectorLength; i++)
+    {
+       for(j = 0; j < states; j++)
+         if(tipVector[i * states + j] > PLL_MAX_TIP_EV)
+           tipVector[i * states + j] = PLL_MAX_TIP_EV;
+    }
+
+
+  
+
+  for (i = 0; i < states; i++)
+    {
+      rax_free(EIGV[i]);
+      rax_free(a[i]);
+      rax_free(r[i]);
+    }
+
+  rax_free(r);
+  rax_free(a);
+  rax_free(EIGV);
+
+  rax_free(f);
+  rax_free(e);
+  rax_free(d);
+  rax_free(invfreq);
+  rax_free(EIGN);
+}
+
+/** @brief Initialize GTR
+  *
+  * Wrapper function for the decomposition of the substitution rates matrix
+  * into eigenvectors and eigenvalues
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param model
+  *   Partition index
+  */
+void pllInitReversibleGTR(pllInstance * tr, partitionList * pr, int model)
+{ 
+ double   
+   *ext_EIGN         = pr->partitionData[model]->EIGN,
+   *EV               = pr->partitionData[model]->EV,
+   *EI               = pr->partitionData[model]->EI,
+   *frequencies      = pr->partitionData[model]->frequencies,
+   *empiricalFrequencies = pr->partitionData[model]->empiricalFrequencies,
+   *ext_initialRates = pr->partitionData[model]->substRates,
+   *tipVector        = pr->partitionData[model]->tipVector,
+   *fracchange       = &(pr->partitionData[model]->fracchange);
+ 
+  
+ int states = pr->partitionData[model]->states;
+
+ switch(pr->partitionData[model]->dataType)
+   { 
+   case PLL_GENERIC_32:
+   case PLL_GENERIC_64:
+   case PLL_SECONDARY_DATA_6:
+   case PLL_SECONDARY_DATA_7: 
+   case PLL_SECONDARY_DATA:
+   case PLL_DNA_DATA:
+   case PLL_BINARY_DATA:    
+     initGeneric(states, 
+                 getBitVector(pr->partitionData[model]->dataType),
+                 getUndetermined(pr->partitionData[model]->dataType) + 1,
+                 fracchange,
+                 ext_EIGN, 
+                 EV, 
+                 EI, 
+                 frequencies, 
+                 ext_initialRates,
+                 tipVector
+                 // model
+                );
+     break;   
+   case PLL_AA_DATA:
+     if(pr->partitionData[model]->protModels != PLL_GTR)
+       {
+         double f[20];
+         int l;
+
+         if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+           {
+             int 
+               i;
+             
+             for(i = 0; i < 4; i++)
+               {                 
+                 initProtMat(f, pr->partitionData[model]->protModels, &(pr->partitionData[model]->substRates_LG4[i][0]), i);
+                 
+                 if(!pr->partitionData[model]->optimizeBaseFrequencies)
+                 {
+                   if(!pr->partitionData[model]->protUseEmpiricalFreqs)
+                   {
+                     for(l = 0; l < 20; l++)            
+                       pr->partitionData[model]->frequencies_LG4[i][l] = f[l];
+                   }
+                   else
+                   {
+                     for(l = 0; l < 20; l++)            
+                       pr->partitionData[model]->frequencies_LG4[i][l] = empiricalFrequencies[l];
+                   }
+                 }
+                 else
+                 {
+                   memcpy(pr->partitionData[model]->frequencies_LG4[i], frequencies, 20 * sizeof(double));
+                 }
+               }
+           }
+         else
+           {
+             if(pr->partitionData[model]->protModels == PLL_AUTO)
+               initProtMat(f, pr->partitionData[model]->autoProtModels, ext_initialRates, 0);
+             else         
+               {
+                 initProtMat(f, pr->partitionData[model]->protModels, ext_initialRates, 0);
+               }
+
+             /*if(adef->protEmpiricalFreqs && tr->NumberOfModels == 1)
+               assert(tr->partitionData[model].protUseEmpiricalFreqs);*/
+         
+              if (!pr->partitionData[model]->optimizeBaseFrequencies) {
+                  if(!pr->partitionData[model]->protUseEmpiricalFreqs)
+                  {                 
+                      for(l = 0; l < 20; l++)           
+                         frequencies[l] = f[l];
+                  } else {
+                      for(l = 0; l < 20; l++)           
+                         frequencies[l] = empiricalFrequencies[l];
+                  }
+              }
+           }  
+       }
+               
+     if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+       {
+         int 
+           i;
+
+         double 
+           *fracchanges_LG4[4],
+           acc = 0.0;
+
+         /* TODO frac change !*/
+
+         for(i = 0; i < 4; i++)
+           {
+             fracchanges_LG4[i]  = (double *)rax_malloc(pr->numberOfPartitions * sizeof(double));
+             initGeneric(states, 
+                         bitVectorAA, 
+                         23, 
+                         fracchanges_LG4[i],
+                         pr->partitionData[model]->EIGN_LG4[i], 
+                         pr->partitionData[model]->EV_LG4[i],
+                         pr->partitionData[model]->EI_LG4[i],
+                         pr->partitionData[model]->frequencies_LG4[i],
+                         pr->partitionData[model]->substRates_LG4[i],
+                         pr->partitionData[model]->tipVector_LG4[i]
+             //            model
+                        );   
+           }
+
+         for(i = 0; i < 4; i++)
+           {        
+             acc += fracchanges_LG4[i][model];
+             rax_free(fracchanges_LG4[i]);
+           }
+
+         //tr->fracchanges[model] = acc / 4;
+         //TODO check if valid
+         *fracchange = acc / 4;
+       }
+     else
+       initGeneric(states, 
+                   bitVectorAA, 
+                   23, 
+                   fracchange,
+                   ext_EIGN, 
+                   EV, 
+                   EI, 
+                   frequencies, 
+                   ext_initialRates,
+                   tipVector
+       //            model
+                  );
+    break;  
+   default:
+     assert(0);
+   } 
+
+ updateFracChange(tr, pr);
+}
+
+
+double LnGamma (double alpha)
+{
+/* returns ln(gamma(alpha)) for alpha>0, accurate to 10 decimal places.  
+   Stirling's formula is used for the central polynomial part of the procedure.
+   Pike MC & Hill ID (1966) Algorithm 291: Logarithm of the gamma function.
+   Communications of the Association for Computing Machinery, 9:684
+*/
+  double x, f, z, result;
+
+  x = alpha;
+  f = 0.0;
+  
+  if ( x < 7.0) 
+     {
+       f = 1.0;  
+       z = alpha - 1.0;
+      
+       while ((z = z + 1.0) < 7.0)  
+         {        
+           f *= z;
+         }
+       x = z;   
+     
+       assert(f != 0.0);
+        
+       f=-log(f);
+     }
+   
+   z = 1/(x*x);
+   
+   result = f + (x-0.5)*log(x) - x + .918938533204673 
+          + (((-.000595238095238*z+.000793650793651)*z-.002777777777778)*z
+               +.083333333333333)/x;  
+
+   return result;
+}
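+
+/* Quick sanity checks: LnGamma(1.0) and LnGamma(2.0) are 0 (Gamma(1) = Gamma(2) = 1), and
+   LnGamma(5.0) = ln(4!) = ln(24), approximately 3.178. */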
+
+
+
+double IncompleteGamma (double x, double alpha, double ln_gamma_alpha)
+{
+/* returns the incomplete gamma ratio I(x,alpha) where x is the upper 
+           limit of the integration and alpha is the shape parameter.
+   returns (-1) if in error
+   ln_gamma_alpha = ln(Gamma(alpha)), is almost redundant.
+   (1) series expansion     if (alpha>x || x<=1)
+   (2) continued fraction   otherwise
+   RATNEST FORTRAN by
+   Bhattacharjee GP (1970) The incomplete gamma integral.  Applied Statistics,
+   19: 285-287 (AS32)
+*/
+   int i;
+   double p=alpha, g=ln_gamma_alpha;
+   double accurate=1e-8, overflow=1e30;
+   double factor, gin=0, rn=0, a=0,b=0,an=0,dif=0, term=0, pn[6];
+
+
+   if (x==0) return (0);
+   if (x<0 || p<=0) return (-1);
+
+   
+   factor=exp(p*log(x)-x-g);   
+   if (x>1 && x>=p) goto l30;
+   /* (1) series expansion */
+   gin=1;  term=1;  rn=p;
+ l20:
+   rn++;
+   term*=x/rn;   gin+=term;
+
+   if (term > accurate) goto l20;
+   gin*=factor/p;
+   goto l50;
+ l30:  
+   /* (2) continued fraction */
+   a=1-p;   b=a+x+1;  term=0;
+   pn[0]=1;  pn[1]=x;  pn[2]=x+1;  pn[3]=x*b;
+   gin=pn[2]/pn[3];   
+ l32:  
+   a++;  
+   b+=2;  
+   term++;   
+   an=a*term;
+   for (i=0; i<2; i++) 
+     pn[i+4]=b*pn[i+2]-an*pn[i];
+   if (pn[5] == 0) goto l35;
+   rn=pn[4]/pn[5];   
+   dif=fabs(gin-rn);  
+   if (dif>accurate) goto l34;
+   if (dif<=accurate*rn) goto l42;
+ l34:   
+   gin=rn;
+ l35: 
+   for (i=0; i<4; i++) 
+     pn[i]=pn[i+2];
+   if (fabs(pn[4]) < overflow)            
+     goto l32;        
+   
+   for (i=0; i<4; i++) 
+     pn[i]/=overflow;
+
+   
+   goto l32;
+ l42:  
+   gin=1-factor*gin;
+
+ l50: 
+   return (gin);
+}
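+
+/* Example: for alpha = 1 we have Gamma(1) = 1, so ln_gamma_alpha = 0 and the ratio reduces to
+   I(x, 1) = 1 - exp(-x); e.g. IncompleteGamma(1.0, 1.0, 0.0) is approximately 0.632. */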
+
+
+
+
+double PointNormal (double prob)
+{
+/* returns z so that Prob{x<z}=prob where x ~ N(0,1) and (1e-12)<prob<1-(1e-12)
+   returns (-9999) if in error
+   Odeh RE & Evans JO (1974) The percentage points of the normal distribution.
+   Applied Statistics 22: 96-97 (AS70)
+
+   Newer methods:
+     Wichura MJ (1988) Algorithm AS 241: the percentage points of the
+       normal distribution.  Applied Statistics 37: 477-484.
+     Beasley JD & Springer SG (1977) Algorithm AS 111: the percentage 
+       points of the normal distribution.  Applied Statistics 26: 118-121.
+
+*/
+   double a0=-.322232431088, a1=-1, a2=-.342242088547, a3=-.0204231210245;
+   double a4=-.453642210148e-4, b0=.0993484626060, b1=.588581570495;
+   double b2=.531103462366, b3=.103537752850, b4=.0038560700634;
+   double y, z=0, p=prob, p1;
+
+   p1 = (p<0.5 ? p : 1-p);
+   if (p1<1e-20) return (-9999);
+
+   y = sqrt (log(1/(p1*p1)));   
+   z = y + ((((y*a4+a3)*y+a2)*y+a1)*y+a0) / ((((y*b4+b3)*y+b2)*y+b1)*y+b0);
+   return (p<0.5 ? -z : z);
+}
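+
+/* Example: PointNormal(0.975) is approximately 1.96, the familiar 97.5% quantile of the
+   standard normal distribution (approximate, since AS70 is itself an approximation). */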
+
+
+double PointChi2 (double prob, double v)
+{
+/* returns z so that Prob{x<z}=prob where x is Chi2 distributed with df=v
+   returns -1 if in error.   0.000002<prob<0.999998
+   RATNEST FORTRAN by
+       Best DJ & Roberts DE (1975) The percentage points of the 
+       Chi2 distribution.  Applied Statistics 24: 385-388.  (AS91)
+   Converted into C by Ziheng Yang, Oct. 1993.
+*/
+   double e=.5e-6, aa=.6931471805, p=prob, g;
+   double xx, c, ch, a=0,q=0,p1=0,p2=0,t=0,x=0,b=0,s1,s2,s3,s4,s5,s6;
+  
+   if (p<.000002 || p>.999998 || v<=0) return (-1);
+  
+   g = LnGamma(v/2);
+   
+   xx=v/2;   c=xx-1;
+   if (v >= -1.24*log(p)) goto l1;
+
+   ch=pow((p*xx*exp(g+xx*aa)), 1/xx);
+   if (ch-e<0) return (ch);
+   goto l4;
+l1:
+   if (v>.32) goto l3;
+   ch=0.4;   a=log(1-p);
+l2:
+   q=ch;  p1=1+ch*(4.67+ch);  p2=ch*(6.73+ch*(6.66+ch));
+   t=-0.5+(4.67+2*ch)/p1 - (6.73+ch*(13.32+3*ch))/p2;
+   ch-=(1-exp(a+g+.5*ch+c*aa)*p2/p1)/t;
+   if (fabs(q/ch-1)-.01 <= 0) goto l4;
+   else                       goto l2;
+  
+l3:    
+   x=PointNormal (p);
+   p1=0.222222/v;   ch=v*pow((x*sqrt(p1)+1-p1), 3.0);
+   if (ch>2.2*v+6)  ch=-2*(log(1-p)-c*log(.5*ch)+g);
+l4:
+   q=ch;   p1=.5*ch;   
+   if ((t=IncompleteGamma (p1, xx, g))< 0.0) 
+     {
+       printf ("IncompleteGamma \n");      
+       return (-1);
+     }
+  
+   p2=p-t;
+   t=p2*exp(xx*aa+g+p1-c*log(ch));   
+   b=t/ch;  a=0.5*t-b*c;
+
+   s1=(210+a*(140+a*(105+a*(84+a*(70+60*a))))) / 420;
+   s2=(420+a*(735+a*(966+a*(1141+1278*a))))/2520;
+   s3=(210+a*(462+a*(707+932*a)))/2520;
+   s4=(252+a*(672+1182*a)+c*(294+a*(889+1740*a)))/5040;
+   s5=(84+264*a+c*(175+606*a))/2520;
+   s6=(120+c*(346+127*c))/5040;
+   ch+=t*(1+0.5*t*s1-b*c*(s1-b*(s2-b*(s3-b*(s4-b*(s5-b*s6))))));
+   if (fabs(q/ch-1) > e) goto l4;
+
+   return (ch);
+}
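+
+/* Example: PointChi2(0.95, 1) is approximately 3.84 and PointChi2(0.95, 2) approximately 5.99,
+   the usual 95% critical values of the Chi2 distribution with 1 and 2 degrees of freedom. */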
+
+/** @brief Compute the gamma rates
+    
+    Compute the \a K discrete gamma rate categories for a given \a alpha shape parameter
+
+    @param alpha
+      Alpha parameter
+
+    @param gammaRates
+      Array where to store the computed gamma rates
+
+    @param K
+      Number of categories
+
+    @param useMedian
+      Boolean flag indicating whether the median (rather than the mean) of each rate category is used
+
+    @todo
+       Document this more.
+*/
+void pllMakeGammaCats(double alpha, double *gammaRates, int K, pllBoolean useMedian)
+{
+  int 
+    i;
+
+  double 
+    factor = alpha / alpha * K, 
+    lnga1, 
+    alfa = alpha, 
+    beta = alpha,
+    *gammaProbs = (double *)rax_malloc(K * sizeof(double));
+
+  /* Note that PLL_ALPHA_MIN setting is somewhat critical due to   */
+  /* numerical instability caused by very small rate[0] values */
+  /* induced by low alpha values around 0.01 */
+
+  assert(alfa >= PLL_ALPHA_MIN); 
+
+  if(useMedian)
+    {
+      double  
+        middle = 1.0 / (2.0*K),
+        t = 0.0; 
+      
+      for(i = 0; i < K; i++)     
+        gammaRates[i] = PLL_POINT_GAMMA((double)(i * 2 + 1) * middle, alfa, beta);
+      
+      for (i = 0; i < K; i++) 
+        t += gammaRates[i];
+       for( i = 0; i < K; i++)     
+         gammaRates[i] *= factor / t;
+    }
+  else
+    {
+      lnga1 = LnGamma(alfa + 1);
+
+      for (i = 0; i < K - 1; i++)
+        gammaProbs[i] = PLL_POINT_GAMMA((i + 1.0) / K, alfa, beta);
+
+      for (i = 0; i < K - 1; i++)
+        gammaProbs[i] = IncompleteGamma(gammaProbs[i] * beta, alfa + 1, lnga1);   
+
+      gammaRates[0] = gammaProbs[0] * factor;
+      
+      gammaRates[K - 1] = (1 - gammaProbs[K - 2]) * factor;
+
+      for (i= 1; i < K - 1; i++)  
+        gammaRates[i] = (gammaProbs[i] - gammaProbs[i - 1]) * factor;      
+    }
+  /* assert(gammaRates[0] >= 0.00000000000000000000000000000044136090435925743185910935350715027016962154188875); */
+
+  rax_free(gammaProbs);
+
+  return;  
+}
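+
+/* Since factor equals K and the category probabilities are 1/K each, the K rates always average
+   to 1.  For example (mean rates, values approximate), alpha = 0.5 with K = 4 yields roughly
+   {0.03, 0.25, 0.82, 2.89}, i.e. strong rate heterogeneity, while a large alpha pushes all
+   four rates towards 1. */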
+
+
+/** @brief Set the substitution rates
+  *
+  * Set the first \a rates - 1 substitution rates to an initial value and the last rate to 1.0.
+  *
+  * @param r
+  *  Array of substitution rates
+  *
+  * @param rates
+  *   Number of rates to set
+  */
+static void setRates(double *r, int rates)
+{
+  int i;
+
+  //changed to 1.0 instead of 0.5 to make it easier to implement an 
+  //interface function for setting models other than GTR 
+
+  for(i = 0; i < rates - 1; i++)
+    r[i] = 1.0;
+
+  r[rates - 1] = 1.0;
+}
+
+/** @brief Initialize the substitution rates matrix
+  *
+  * Initialize the substitution rates matrices for all partitions
+  *
+  * @param tr
+  *   The PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @todo
+  *   Do we need the secondary structure and binary? Will we only use GTR? If yes,
+  *   we could rename this function to initRateMatrixGTR
+  */
+void initRateMatrix(pllInstance *tr, partitionList *pr)
+{
+  int model;
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {   
+      int       
+        i,
+        states = pr->partitionData[model]->states,
+        rates  = (states * states - states) / 2;
+      
+      switch(pr->partitionData[model]->dataType)
+        {
+        case PLL_BINARY_DATA:
+        case PLL_DNA_DATA:
+        case PLL_SECONDARY_DATA:
+        case PLL_SECONDARY_DATA_6:
+        case PLL_SECONDARY_DATA_7:
+          setRates(pr->partitionData[model]->substRates, rates);
+          break;          
+        case PLL_GENERIC_32:
+        case PLL_GENERIC_64:      
+          switch(tr->multiStateModel)
+            {
+            case PLL_ORDERED_MULTI_STATE:
+              {
+                int 
+                  j, 
+                  k, 
+                  i = 0;
+                
+                for(j = 0; j < states; j++)
+                  for(k = j + 1; k < states; k++)
+                    pr->partitionData[model]->substRates[i++] = (double)(k - j);
+                assert(i == rates);             
+              }
+              break;
+            case PLL_MK_MULTI_STATE:
+              for(i = 0; i < rates; i++)
+                pr->partitionData[model]->substRates[i] = 1.0;
+              
+              break;
+            case PLL_GTR_MULTI_STATE:
+              setRates(pr->partitionData[model]->substRates, rates);
+              break;
+            default:
+              assert(0);
+            }
+          break;
+        case PLL_AA_DATA:
+          if(pr->partitionData[model]->protModels == PLL_GTR)
+            {
+              //set optimizeSubstRates to true !
+              pr->partitionData[model]->optimizeSubstitutionRates = PLL_TRUE;
+              putWAG(pr->partitionData[model]->substRates);
+            }
+          break;
+        default:
+          assert(0);
+        }           
+      
+      if(pr->partitionData[model]->nonGTR)
+        {
+          assert(pr->partitionData[model]->dataType == PLL_SECONDARY_DATA ||
+                 pr->partitionData[model]->dataType == PLL_SECONDARY_DATA_6 ||
+                 pr->partitionData[model]->dataType == PLL_SECONDARY_DATA_7);
+                  
+          for(i = 0; i < rates; i++)
+            {
+              if(pr->partitionData[model]->symmetryVector[i] == -1)
+                pr->partitionData[model]->substRates[i] = 0.0;
+              else
+                {
+                  if(pr->partitionData[model]->symmetryVector[i] == pr->partitionData[model]->symmetryVector[rates - 1])
+                    pr->partitionData[model]->substRates[i] = 1.0;
+                }
+            }
+        }
+    }  
+}
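+
+/* The number of rates per partition is states * (states - 1) / 2: 6 for DNA (4 states),
+   190 for amino acids (20 states), 15 and 21 for the 6- and 7-state secondary structure
+   models, and 120 for the 16-state model. */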
+
+/** @brief Function for setting secondary structure symmetries
+  *
+  * @todo
+  *   Do we need this function?
+*/
+static void setSymmetry(int *s, int *sDest, const int sCount, int *f, int *fDest, const int fCount)
+{
+  int i;
+
+  for(i = 0; i < sCount; i++)
+    sDest[i] = s[i];
+
+  for(i = 0; i < fCount; i++)
+    fDest[i] = f[i];
+}
+
+/** @brief Wrapper function for setting secondary structure symmetries
+  *
+  * @todo
+  *   Do we need this function?
+*/
+static void setupSecondaryStructureSymmetries(pllInstance *tr, partitionList *partitions)
+{
+  int model;
+  int numberOfModels = partitions->numberOfPartitions;
+
+  for(model = 0; model < numberOfModels; model++)
+    {
+      if(partitions->partitionData[model]->dataType == PLL_SECONDARY_DATA ||
+                  partitions->partitionData[model]->dataType == PLL_SECONDARY_DATA_6 ||
+                  partitions->partitionData[model]->dataType == PLL_SECONDARY_DATA_7)
+        {       
+          switch(tr->secondaryStructureModel)
+            {
+            case PLL_SEC_6_A:
+                partitions->partitionData[model]->nonGTR = PLL_FALSE;
+              break;
+            case PLL_SEC_6_B:
+              {
+                int f[6]  = {0, 1, 2, 3, 4, 5};
+                int s[15] = {2, 0, 1, 2, 2, 2, 2, 0, 1, 1, 2, 2, 2, 2, 1};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 15, f, partitions->partitionData[model]->frequencyGrouping, 6);
+                  
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+              }
+              break;
+            case PLL_SEC_6_C:
+              {
+                int f[6]  = {0, 2, 2, 1, 0, 1};
+                int s[15] = {2, 0, 1, 2, 2, 2, 2, 0, 1, 1, 2, 2, 2, 2, 1};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 15, f, partitions->partitionData[model]->frequencyGrouping, 6);
+                
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+              }
+              break;
+            case PLL_SEC_6_D:
+              {
+                int f[6]  = {0, 2, 2, 1, 0, 1};
+                int s[15] = {2, -1, 1, 2, 2, 2, 2, -1, 1, 1, 2, 2, 2, 2, 1};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 15, f, partitions->partitionData[model]->frequencyGrouping, 6);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+              }
+              break;
+            case PLL_SEC_6_E:
+              {
+                int f[6]  = {0, 1, 2, 3, 4, 5};
+                int s[15] = {2, -1, 1, 2, 2, 2, 2, -1, 1, 1, 2, 2, 2, 2, 1};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 15, f, partitions->partitionData[model]->frequencyGrouping, 6);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+              }
+              break;
+            case PLL_SEC_7_A:
+                partitions->partitionData[model]->nonGTR = PLL_FALSE;
+              break;
+            case PLL_SEC_7_B:
+              {
+                int f[7]  = {0, 2, 2, 1, 0, 1, 3};
+                int s[21] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
+                
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 21, f, partitions->partitionData[model]->frequencyGrouping, 7);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+
+              }
+              break;
+            case PLL_SEC_7_C:
+              {
+                int f[7]  = {0, 1, 2, 3, 4, 5, 6};
+                int s[21] = {-1, -1, 0, -1, -1, 4, -1, -1, -1, 3, 5, 1, -1, -1, 6, -1, -1, 7, 2, 8, 9};
+                
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 21, f, partitions->partitionData[model]->frequencyGrouping, 7);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+
+              }
+              break;
+            case PLL_SEC_7_D:
+              {
+                int f[7]  = {0, 1, 2, 3, 4, 5, 6};
+                int s[21] = {2, 0, 1, 2, 2, 3, 2, 2, 0, 1, 3, 1, 2, 2, 3, 2, 2, 3, 1, 3, 3};
+                
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 21, f, partitions->partitionData[model]->frequencyGrouping, 7);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+
+              }
+              break;
+            case PLL_SEC_7_E:
+              {
+                int f[7]  = {0, 1, 2, 3, 4, 5, 6};
+                int s[21] = {-1, -1, 0, -1, -1, 1, -1, -1, -1, 0, 1, 0, -1, -1, 1, -1, -1, 1, 0, 1, 1};
+                
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 21, f, partitions->partitionData[model]->frequencyGrouping, 7);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+
+              }
+              break;
+            case PLL_SEC_7_F:
+              {
+                int f[7]  = {0, 2, 2, 1, 0, 1, 3};
+                int s[21] = {2, 0, 1, 2, 2, 3, 2, 2, 0, 1, 3, 1, 2, 2, 3, 2, 2, 3, 1, 3, 3};            
+                
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 21, f, partitions->partitionData[model]->frequencyGrouping, 7);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+
+              }
+              break;
+              
+            case PLL_SEC_16:
+                partitions->partitionData[model]->nonGTR = PLL_FALSE;
+              break;
+            case PLL_SEC_16_A:
+              {
+                int f[16]  = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+                int s[120] = {/* AA */  4,  4,  3,  4, -1, -1, -1,  4, -1, -1, -1,  3, -1, -1, -1,
+                              /* AC */  4,  3, -1,  4, -1, -1, -1,  3, -1, -1, -1,  4, -1, -1,
+                              /* AG */  3, -1, -1,  3, -1, -1, -1,  4, -1, -1, -1,  3, -1,
+                              /* AU */ -1, -1,  2,  3, -1,  0, -1,  1,  2, -1,  2,  3,
+                              /* CA */  4,  3,  4,  4, -1, -1, -1,  3, -1, -1, -1,
+                              /* CC */  3,  4, -1,  3, -1, -1, -1,  4, -1, -1,
+                              /* CG */  3, -1,  2,  3,  2,  0, -1,  1, -1,
+                              /* CU */ -1, -1, -1,  3, -1, -1, -1,  4,
+                              /* GA */  3,  4,  3,  3, -1, -1, -1,
+                              /* GC */  3,  1,  2,  3,  2, -1,
+                              /* GG */  3, -1, -1,  3, -1,
+                              /* GU */  2, -1,  2,  3,
+                              /* UA */  3,  1,  3,
+                              /* UC */  3,  4,
+                              /* UG */  3};
+                              
+                
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 120, f, partitions->partitionData[model]->frequencyGrouping, 16);
+                              
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+
+                }
+              break;
+            case PLL_SEC_16_B:
+              {
+                int f[16]  = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+                int s[120] = {/* AA */  0,  0,  0,  0, -1, -1, -1,  0, -1, -1, -1,  0, -1, -1, -1,
+                              /* AC */  0,  0, -1,  0, -1, -1, -1,  0, -1, -1, -1,  0, -1, -1,
+                              /* AG */  0, -1, -1,  0, -1, -1, -1,  0, -1, -1, -1,  0, -1,
+                              /* AU */ -1, -1,  0,  0, -1,  0, -1,  0,  0, -1,  0,  0,
+                              /* CA */  0,  0,  0,  0, -1, -1, -1,  0, -1, -1, -1,
+                              /* CC */  0,  0, -1,  0, -1, -1, -1,  0, -1, -1,
+                              /* CG */  0, -1,  0,  0,  0,  0, -1,  0, -1,
+                              /* CU */ -1, -1, -1,  0, -1, -1, -1,  0,
+                              /* GA */  0,  0,  0,  0, -1, -1, -1,
+                              /* GC */  0,  0,  0,  0,  0, -1,
+                              /* GG */  0, -1, -1,  0, -1,
+                              /* GU */  0, -1,  0,  0,
+                              /* UA */  0,  0,  0,
+                              /* UC */  0,  0,
+                              /* UG */  0};
+                              
+                
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 120, f, partitions->partitionData[model]->frequencyGrouping, 16);
+                              
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+              }
+              break;
+            case PLL_SEC_16_C:        
+            case PLL_SEC_16_D:
+            case PLL_SEC_16_E:
+            case PLL_SEC_16_F:
+            case PLL_SEC_16_I:
+            case PLL_SEC_16_J:
+            case PLL_SEC_16_K:
+              assert(0);
+            default:
+              assert(0);
+            }
+        }
+
+    }
+
+}
+
+/** @brief Initialize base frequencies in partition data
+  *
+  * Copy the computed empirical frequencies for each partition from the \a empiricalFrequencies
+  * structure to each partition structure.
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param empiricalFrequencies
+  *   Array containing the empirical frequencies
+*/
+static void initializeBaseFreqs(partitionList *pr, double **empiricalFrequencies)
+{
+  size_t 
+    model;
+  int
+    l,
+    numFreqs;
+  double f;
+
+  for(model = 0; model < (size_t)pr->numberOfPartitions; model++)
+    {
+      if(pr->partitionData[model]->optimizeBaseFrequencies)
+       {
+         //set all base frequencies to identical starting values 1.0 / numberOfDataStates
+         numFreqs = pr->partitionData[model]->states;
+         f = 1.0 / ((double)numFreqs);
+
+         for(l = 0; l < numFreqs; l++)
+          {
+            pr->partitionData[model]->frequencies[l]          = f;
+            pr->partitionData[model]->empiricalFrequencies[l] = f;
+          }
+       }
+      else
+       {
+         memcpy(pr->partitionData[model]->frequencies,          empiricalFrequencies[model], sizeof(double) * pr->partitionData[model]->states);
+         memcpy(pr->partitionData[model]->empiricalFrequencies, empiricalFrequencies[model], sizeof(double) * pr->partitionData[model]->states);
+       }
+    }
+}
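+
+/* For example, a DNA partition with optimizeBaseFrequencies enabled starts with all four
+   frequencies set to 0.25, whereas otherwise the empirical frequencies counted from the
+   alignment are copied in unchanged. */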
+
+
+/** @brief Initialize the model parameters
+  * 
+  * Initialize the model parameters. Specifically
+  *   - Base frequencies
+  *   - Rate matrix
+  *
+  * @param tr
+  *   The PLL instance
+  *
+  * @param empiricalFrequencies
+  *   Pointer to the empirical frequencies array
+  *
+  * @param partitions
+  *   Pointer to the partitions structure
+  *
+  * @todo
+  *   What is \a tr->optimizeRateCategoryInvocations = 1 for?
+  */
+void initModel(pllInstance *tr, double **empiricalFrequencies, partitionList * partitions)
+{  
+  int model, j;
+  double  temp;  
+     
+  tr->optimizeRateCategoryInvocations = 1;      
+  tr->numberOfInvariableColumns = 0;
+  tr->weightOfInvariableColumns = 0;           
+  
+  for (j = 0; j < tr->originalCrunchedLength; j++) 
+    {
+      tr->patrat[j] = temp = 1.0;
+      tr->patratStored[j] = 1.0;
+      tr->rateCategory[j] = 0;           
+    } 
+
+  /* PSR (CAT) model init */
+  for(model = 0; model < partitions->numberOfPartitions; model++)
+    {            
+          partitions->partitionData[model]->numberOfCategories = 1;
+          partitions->partitionData[model]->perSiteRates[0] = 1.0;
+    }
+    
+  updatePerSiteRates(tr, partitions, PLL_FALSE);
+ 
+  setupSecondaryStructureSymmetries(tr, partitions);
+  
+  initRateMatrix(tr, partitions);
+
+  initializeBaseFreqs(partitions, empiricalFrequencies);
+  
+  for(model = 0; model < partitions->numberOfPartitions; model++)
+   {
+     int
+       k;
+
+     partitions->partitionData[model]->alpha = 1.0;
+     if(partitions->partitionData[model]->dataType == PLL_AA_DATA && partitions->partitionData[model]->protModels == PLL_AUTO)
+       partitions->partitionData[model]->autoProtModels = PLL_WAG; /* initialize by WAG per default */
+      
+     pllInitReversibleGTR(tr, partitions, model); /* Decomposition of Q matrix */
+      /* GAMMA model init */
+     pllMakeGammaCats(partitions->partitionData[model]->alpha, partitions->partitionData[model]->gammaRates, 4, tr->useMedian);
+
+     for(k = 0; k < partitions->partitionData[model]->states; k++)
+       partitions->partitionData[model]->freqExponents[k] = 0.0;
+
+     for(k = 0; k < 4; k++)
+     {
+	   partitions->partitionData[model]->lg4x_weights[k] = 0.25;
+	   partitions->partitionData[model]->lg4x_weightExponents[k] = 0.0;
+     }
+
+   }                                   
+  
+  if(partitions->numberOfPartitions > 1)
+    {
+      tr->fracchange = 0;
+      for(model = 0; model < partitions->numberOfPartitions; model++) 
+        tr->fracchange += partitions->partitionData[model]->fracchange;
+      
+      tr->fracchange /= ((double)partitions->numberOfPartitions);
+    }  
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier(tr, partitions, PLL_THREAD_COPY_INIT_MODEL);
+#endif
+}
+
+
+
+
diff --git a/pll/newick.c b/pll/newick.c
new file mode 100644
index 0000000..ceb9653
--- /dev/null
+++ b/pll/newick.c
@@ -0,0 +1,583 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file newick.c
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+
+/** @file  newick.c
+
+    @brief Collection of routines for reading and parsing newick trees
+
+    Auxiliary functions for reading and parsing newick tree formats
+*/
+
+
+/** @defgroup newickParseGroup Reading and parsing newick trees
+    
+    This set of functions handles the reading and parsing of newick tree formats
+*/
+
+static int
+parse_newick (pllStack ** stack, int * inp)
+{
+  pllNewickNodeInfo * item = NULL;
+  int item_active = 0;
+  pllLexToken token;
+  int input;
+  pllLexToken prev_token;
+  int nop = 0;          /* number of open parentheses */
+  int depth = 0;
+
+  prev_token.tokenType = PLL_TOKEN_UNKNOWN;
+
+  input = *inp;
+
+  NEXT_TOKEN
+  
+  while (token.tokenType != PLL_TOKEN_EOF && token.tokenType != PLL_TOKEN_UNKNOWN)
+  {
+    switch (token.tokenType)
+     {
+       case PLL_TOKEN_OPAREN:
+#ifdef PLLDEBUG
+       printf ("PLL_TOKEN_OPAREN\n");
+#endif
+        ++nop;
+        memcpy (&prev_token, &token, sizeof (pllLexToken));
+        ++depth;
+        break;
+
+       case PLL_TOKEN_CPAREN:
+#ifdef PLLDEBUG
+       printf ("PLL_TOKEN_CPAREN\n");
+#endif
+        if (prev_token.tokenType != PLL_TOKEN_CPAREN  &&
+            prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
+            prev_token.tokenType != PLL_TOKEN_STRING  &&
+            prev_token.tokenType != PLL_TOKEN_NUMBER  &&
+            prev_token.tokenType != PLL_TOKEN_FLOAT) return (0);
+
+        if (!nop) return (0);
+        --nop;
+        memcpy (&prev_token, &token, sizeof (pllLexToken));
+
+        /* push to the stack */
+        if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); // possibly not necessary
+        //if (item->name   == NULL) item->name   = strdup ("INTERNAL_NODE");
+        if (item->name == NULL) 
+         {
+           item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char));
+           strcpy (item->name, "INTERNAL_NODE");
+         }
+
+        //if (item->branch == NULL) item->branch = strdup ("0.000000"); 
+        if (item->branch == NULL) 
+         {
+           item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
+           strcpy (item->branch, "0.000000");
+         }
+        item->depth = depth;
+        pllStackPush (stack, item);
+        item_active  = 1;       /* active = 1 */
+        item = NULL;
+        --depth;
+        break;
+
+       case PLL_TOKEN_STRING:
+#ifdef PLLDEBUG
+       printf ("PLL_TOKEN_STRING      %.*s\n", token.len, token.lexeme);
+#endif
+        if (prev_token.tokenType != PLL_TOKEN_OPAREN &&
+            prev_token.tokenType != PLL_TOKEN_CPAREN &&
+            prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
+            prev_token.tokenType != PLL_TOKEN_COMMA) return (0);
+        if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
+        item->name = my_strndup (token.lexeme, token.len);
+
+        item_active = 1;
+        item->depth = depth;
+        if (prev_token.tokenType == PLL_TOKEN_COMMA  ||
+            prev_token.tokenType == PLL_TOKEN_OPAREN ||
+            prev_token.tokenType == PLL_TOKEN_UNKNOWN) item->leaf = 1;
+        memcpy (&prev_token, &token, sizeof (pllLexToken));
+        break;
+
+       case PLL_TOKEN_FLOAT:
+       case PLL_TOKEN_NUMBER:
+#ifdef PLLDEBUG
+       if (token.tokenType == PLL_TOKEN_FLOAT) printf ("PLL_TOKEN_FLOAT\n"); else printf ("PLL_TOKEN_NUMBER\n");
+#endif
+         if  (prev_token.tokenType != PLL_TOKEN_OPAREN &&
+              prev_token.tokenType != PLL_TOKEN_CPAREN &&
+              prev_token.tokenType != PLL_TOKEN_COLON  &&
+              prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
+              prev_token.tokenType != PLL_TOKEN_COMMA) return (0);
+        if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
+        if (prev_token.tokenType == PLL_TOKEN_COLON)
+         {
+           item->branch = my_strndup (token.lexeme, token.len);
+         }
+        else
+         {
+           if (prev_token.tokenType == PLL_TOKEN_COMMA  ||
+               prev_token.tokenType == PLL_TOKEN_OPAREN ||
+               prev_token.tokenType == PLL_TOKEN_UNKNOWN) item->leaf = 1;
+           //if (prev_token.tokenType != PLL_TOKEN_UNKNOWN) ++ indent;
+           item->name = my_strndup (token.lexeme, token.len);
+         }
+        item_active = 1;
+        item->depth = depth;
+        memcpy (&prev_token, &token, sizeof (pllLexToken));
+        break;
+
+       case PLL_TOKEN_COLON:
+#ifdef PLLDEBUG
+       printf ("PLL_TOKEN_COLON\n");
+#endif
+        if (prev_token.tokenType != PLL_TOKEN_CPAREN &&
+            prev_token.tokenType != PLL_TOKEN_STRING &&
+            prev_token.tokenType != PLL_TOKEN_FLOAT  &&
+            prev_token.tokenType != PLL_TOKEN_NUMBER) return (0);
+        memcpy (&prev_token, &token, sizeof (pllLexToken));
+        break;
+
+       case PLL_TOKEN_COMMA:
+#ifdef PLLDEBUG
+       printf ("PLL_TOKEN_COMMA\n");
+#endif
+        if (prev_token.tokenType != PLL_TOKEN_CPAREN &&
+             prev_token.tokenType != PLL_TOKEN_STRING &&
+             prev_token.tokenType != PLL_TOKEN_FLOAT && 
+             prev_token.tokenType != PLL_TOKEN_NUMBER) return (0);
+        memcpy (&prev_token, &token, sizeof (pllLexToken));
+        
+        /* push to the stack */
+        if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); // possibly not necessary
+        //if (item->name   == NULL) item->name   = strdup ("INTERNAL_NODE");
+        if (item->name == NULL) 
+         {
+           item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char));
+           strcpy (item->name, "INTERNAL_NODE");
+         }
+        //if (item->branch == NULL) item->branch = strdup ("0.000000"); 
+        if (item->branch == NULL) 
+         {
+           item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
+           strcpy (item->branch, "0.000000");
+         }
+        item->depth = depth;
+        pllStackPush (stack, item);
+        item_active  = 0;
+        item = NULL;
+        break;
+
+       case PLL_TOKEN_SEMICOLON:
+#ifdef PLLDEBUG
+        printf ("PLL_TOKEN_SEMICOLON\n");
+#endif
+        /* push to the stack */
+        if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
+        //if (item->name   == NULL) item->name   = strdup ("ROOT_NODE");
+        if (item->name == NULL) 
+         {
+           item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char));
+           strcpy (item->name, "ROOT_NODE");
+         }
+        //if (item->branch == NULL) item->branch = strdup ("0.000000"); 
+        if (item->branch == NULL) 
+         {
+           item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
+           strcpy (item->branch, "0.000000");
+         }
+        pllStackPush (stack, item);
+        item_active  = 0;
+        item = NULL;
+        break;
+       default:
+#ifdef __DEBUGGING_MODE
+         printf ("Unknown token: %d\n", token.tokenType);
+#endif
+       // TODO: Finish this part and add error codes
+        break;
+     }
+    NEXT_TOKEN
+    CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE);
+  }
+  if (item_active)
+   {
+     if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
+     //if (item->name   == NULL) item->name   = strdup ("ROOT_NODE");
+     if (item->name == NULL) 
+      {
+        item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char));
+        strcpy (item->name, "ROOT_NODE");
+      }
+     //if (item->branch == NULL) item->branch = strdup ("0.000000"); 
+     if (item->branch == NULL) 
+      {
+        item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
+        strcpy (item->branch, "0.000000");
+      }
+     pllStackPush (stack, item);
+     item_active  = 0;
+   }
+
+  if (nop || token.tokenType == PLL_TOKEN_UNKNOWN) 
+   {
+     return (0);
+   }
+
+  return (1);
+}
+
+#ifdef __DEBUGGING_MODE
+void stack_dump(pllStack ** stack)
+{
+  pllNewickNodeInfo * item;
+  pllStack * head;
+  int i;
+
+  head = *stack;
+  while (head)
+   {
+     item = (pllNewickNodeInfo *) head->item;
+
+     for (i = 0; i < item->depth; ++ i) printf ("\t");
+
+     printf ("%s:%s\n", item->name, item->branch);
+
+     head = head->next;
+   }
+}
+#endif
+
+static void
+assign_ranks (pllStack * stack, int * nodes, int * leaves)
+{
+  pllStack * head;
+  pllNewickNodeInfo * item, * tmp;
+  pllStack * preorder = NULL;
+  int children;
+  int depth;
+
+  *nodes = *leaves = 0;
+
+
+  head = stack;
+  while (head)
+  {
+    assert (head->item);
+    item = (pllNewickNodeInfo *) head->item;
+    
+    if (item->leaf)  ++ (*leaves);
+
+    if (preorder)
+     {
+       tmp = (pllNewickNodeInfo *) preorder->item;
+       children = 0;
+       while (item->depth < tmp->depth)
+        {
+          children = 1;
+          depth = tmp->depth;
+          pllStackPop (&preorder);
+          tmp = preorder->item;
+          while (tmp->depth == depth)
+           {
+             ++ children;
+             pllStackPop (&preorder);
+             tmp = (pllNewickNodeInfo *)preorder->item;
+           }
+          tmp->rank += children;
+        }
+     }
+    
+    ++ (*nodes);
+    head = head->next;
+
+    if (item->leaf)
+     {
+       if (!preorder) return;
+
+       children = 1;
+       tmp = preorder->item;
+       while (tmp->depth == item->depth)
+        {
+          ++ children;
+          pllStackPop (&preorder);
+          assert (preorder);
+          tmp = (pllNewickNodeInfo *)preorder->item;
+        }
+       tmp->rank += children;
+     }
+    else
+     {
+       pllStackPush (&preorder, item);
+     }
+  }
+  
+  while (preorder->item != stack->item)
+  {
+    item = (pllNewickNodeInfo *)pllStackPop (&preorder);
+    tmp  = (pllNewickNodeInfo *) preorder->item;
+    children = 1;
+
+    while (tmp->depth == item->depth)
+     {
+       ++ children;
+       item = (pllNewickNodeInfo *) pllStackPop (&preorder);
+       tmp  = (pllNewickNodeInfo *) preorder->item;
+     }
+    tmp->rank += children;
+    children = 0;
+  }
+ assert (preorder->item == stack->item);
+ 
+ pllStackClear (&preorder);
+}
+
+/** @ingroup newickParseGroup
+    @brief Validate if a newick tree is a valid phylogenetic tree
+
+    A valid tree is one where the root node is binary or ternary
+    and all other internal nodes are binary. In case the root
+    is ternary then the tree must contain at least another internal
+    node and the total number of nodes must be equal to 
+    \f$ 2l - 2\f$, where \f$l\f$ is the number of leaves. If the
+    root is binary, then the total number of nodes must be equal
+    to \f$2l - 1\f$.
+
+    @param t
+      Newick tree wrapper structure which contains the stack representation of the parsed newick tree
+
+    @return
+      Returns \b PLL_TRUE if the tree is a valid phylogenetic tree, otherwise \b PLL_FALSE and \a errno is set
+*/
+int
+pllValidateNewick (pllNewickTree * t)
+{
+  pllStack * head;
+  pllNewickNodeInfo * item;
+  int correct = 0;
+ 
+  item = t->tree->item;
+  if (item->rank != 2 && item->rank != 3) return (0);
+  head = t->tree->next;
+  while (head)
+  {
+    item = head->item;
+    if (item->rank != 2 && item->rank != 0) 
+     {
+       return (0);
+     }
+    head = head->next;
+  }
+  
+  item = t->tree->item;
+
+  if (item->rank == 2) 
+   {
+     correct = (t->nodes == 2 * t->tips -1);
+     if (correct)
+      {
+        errno = PLL_NEWICK_ROOTED_TREE;
+      }
+     else
+      {
+        errno = PLL_NEWICK_BAD_STRUCTURE;
+      }
+     return (PLL_FALSE);
+   }
+   
+  
+  correct = ((t->nodes == 2 * t->tips - 2) && t->nodes != 4);
+  if (correct) return (PLL_TRUE);
+
+  errno = PLL_NEWICK_BAD_STRUCTURE;
+
+  return (PLL_FALSE);
+}
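+
+/* Editorial sketch (not part of the upstream sources): a worked example of the
+   node-count rule checked above. For the unrooted quartet "(A,B,(C,D));" the
+   parser records l = 4 tips and a ternary root, so the total must be
+   2l - 2 = 6 nodes (4 tips + 1 inner node + the root) and the tree validates.
+   For the rooted tree "((A,B),(C,D));" the root is binary and the expected
+   total is 2l - 1 = 7 nodes; in that case the function still returns
+   PLL_FALSE but sets errno to PLL_NEWICK_ROOTED_TREE, so the caller can
+   unroot the tree first (see pllNewickUnroot below). */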
+
+
+/** @ingroup newickParseGroup
+    @brief Convert a binary rooted tree to a binary unrooted tree
+
+    Changes the root node to have 3 descendants instead of two, deletes its last immediate descendant internal node
+    and takes the two children (of the deleted internal node) as its children.
+
+    @param t
+      Newick tree
+    
+    @return
+      \b PLL_TRUE in case of success, otherwise \b PLL_FALSE and \a errno is set
+*/
+int
+pllNewickUnroot (pllNewickTree * t)
+{
+  pllStack * tmp;
+  pllNewickNodeInfo * item;
+
+  item = t->tree->item;
+  if (item->rank == 2)
+   {
+     item->rank = 3;
+     t->nodes--;
+     item = t->tree->next->item;
+     if (item->rank == 0)
+      {
+        tmp = t->tree->next->next;
+        t->tree->next->next = t->tree->next->next->next;
+      }
+     else
+      {
+        tmp = t->tree->next;
+        t->tree->next = t->tree->next->next;
+      }
+     item = tmp->item;
+     rax_free (item->name);
+     rax_free (tmp->item);
+     rax_free (tmp);
+   }
+
+  return (pllValidateNewick (t));
+}
+
+
+/** @ingroup newickParseGroup
+    @brief Parse a newick tree string
+  
+    Parse a newick string and create a stack structure which represents the tree
+    in a preorder traversal form. Each element of the stack represents one node
+    and consists of its name, branch length, number of children and depth. The
+    stack structure is finally wrapped in a \a pllNewickTree structure which
+    also contains the number of nodes and leaves.
+
+    @param newick
+      String containing the newick tree
+
+    @return
+      Returns a pointer to the created \a pllNewickTree structure in case of success, otherwise \b NULL
+*/
+pllNewickTree *
+pllNewickParseString (const char * newick)
+{
+  int n, input, rc;
+  pllNewickTree * t;
+  int nodes, leaves;
+  
+  t = (pllNewickTree *) rax_calloc (1, sizeof (pllNewickTree));
+
+  n = strlen (newick);
+
+  init_lexan (newick, n);
+  input = get_next_symbol();
+
+  rc = parse_newick (&(t->tree), &input);
+  if (!rc)
+   {
+     /* TODO: properly clean t->tree */
+     rax_free (t);
+     t = NULL;
+   }
+  else
+   {
+     assign_ranks (t->tree, &nodes, &leaves);
+     t->nodes = nodes;
+     t->tips  = leaves;
+   }
+
+  return (t);
+}
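+
+/* Editorial sketch (not part of the upstream sources): a minimal round trip through
+   the string parser defined above, assuming the public pll.h header is included.
+
+     pllNewickTree * t = pllNewickParseString ("((A:1,B:2):0.5,C:3,D:4);");
+     if (t && pllValidateNewick (t))
+       printf ("%d nodes, %d tips\n", t->nodes, t->tips);   // prints: 6 nodes, 4 tips
+     if (t) pllNewickParseDestroy (&t);                     // frees the stack and sets t to NULL
+   */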
+
+/** @ingroup newickParseGroup
+    @brief Deallocate newick parser stack structure
+
+    Deallocates the newick parser stack structure that represents the parsed tree. It
+    also frees all memory allocated by elements of the stack structure.
+
+    @param t
+      Pointer to the tree stack structure to deallocate; it is set to \b NULL on return
+*/
+void pllNewickParseDestroy (pllNewickTree ** t)
+{
+  pllNewickNodeInfo *  item;
+
+  while ((item = (pllNewickNodeInfo *)pllStackPop (&((*t)->tree))))
+   {
+     rax_free (item->name);
+     rax_free (item->branch);
+     rax_free (item);
+   }
+  rax_free (*t);
+  (*t) = NULL;
+}
+
+/** @ingroup newickParseGroup
+    @brief Parse a newick tree file
+  
+    Parse a newick file and create a stack structure which represents the tree
+    in a preorder traversal form. Each element of the stack represents one node
+    and consists of its name, branch length, number of children (rank) and depth. The
+    stack structure is finally wrapped in a \a pllNewickTree structure which
+    also contains the number of nodes and leaves.
+
+    @param filename
+      Filename containing the newick tree
+
+    @return
+      Returns a pointer to the created \a pllNewickTree structure in case of success, otherwise \b NULL
+*/
+pllNewickTree *
+pllNewickParseFile (const char * filename)
+{
+  long n;
+  char * rawdata;
+  pllNewickTree * t;
+
+  rawdata = pllReadFile (filename, &n);
+  if (!rawdata)
+   {
+     fprintf (stderr, "Error while opening/reading file %s\n", filename);
+     return (0);
+   }
+
+  //printf ("%s\n\n", rawdata);
+
+  t = pllNewickParseString (rawdata);
+
+  rax_free (rawdata);
+
+  return (t);
+}
+
diff --git a/pll/newick.h b/pll/newick.h
new file mode 100644
index 0000000..8810598
--- /dev/null
+++ b/pll/newick.h
@@ -0,0 +1,61 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file newick.h
+ */
+#ifndef __pll_NEWICK__
+#define __pll_NEWICK__
+#include "stack.h"
+/** @brief Intermediate structure for storing a newick tree 
+    
+    Holds the structure of a parsed newick tree. The total number of nodes is stored in \a nodes and the number of leaves in \a tips
+*/
+typedef struct
+{
+  int nodes;                    /**< @brief Total number of nodes in the tree == 2*tips - 1 for rooted and 2*tips -2 for unrooted */
+  int tips;                     /**< @brief Number of leaves (tips) in the tree */
+  pllStack * tree;              /**< @brief Parsed tree represented as elements of a stack. Corresponds to placing the postorder traversal of a rooted tree in a pushdown store */
+} pllNewickTree;
+
+
+/** @brief Information describing the parsed newick tree nodes 
+    
+    This structure is placed in the ::pllNewickTree LIFO element pllNewickTree::tree
+    and describes each node of the parsed tree.
+
+    @todo Rename this to something more proper
+*/
+typedef struct
+{
+  int depth;                    /**< @brief Distance of node from root */
+  char * name;                  /**< @brief Name of the taxon represented by the node (in case it is a leaf) */
+  char * branch;                /**< @brief Length of branch that leads to its parent */
+  int leaf;                     /**< @brief \b PLL_TRUE if the node is a leaf, otherwise \b PLL_FALSE */
+  int rank;                     /**< @brief Rank of the node, i.e. how many children it has */
+} pllNewickNodeInfo;
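+
+/* Editorial note (not part of the upstream header): every element of the
+   pllNewickTree::tree stack is one such record. Leaves carry rank 0 and a
+   non-zero leaf flag, ordinary inner nodes carry rank 2, and the root carries
+   rank 2 (rooted tree) or rank 3 (unrooted tree); these are exactly the values
+   checked by pllValidateNewick() in newick.c. */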
+
+
+#endif
diff --git a/pll/newviewGenericSpecial.c b/pll/newviewGenericSpecial.c
new file mode 100644
index 0000000..e69d7f2
--- /dev/null
+++ b/pll/newviewGenericSpecial.c
@@ -0,0 +1,8736 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file newviewGenericSpecial.c
+ *  
+ * @brief Functions that deal (mostly) with conditional likelihood (re)computation
+ */
+
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+#ifdef __MIC_NATIVE
+#include "mic_native.h"
+#endif
+
+
+#ifdef __SSE3
+#include <stdint.h>
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#include "cycle.h"
+
+static void computeTraversalInfo(nodeptr, traversalInfo *, int *, int, int, pllBoolean, recompVectors *, pllBoolean);
+static void makeP(double z1, double z2, double *rptr, double *EI,  double *EIGN, int numberOfCategories, double *left, double *right, pllBoolean saveMem, int maxCat, const int states);
+#if (defined(__SSE3) && !defined(__AVX))
+static void newviewGTRGAMMAPROT_LG4(int tipCase,
+                                    double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
+                                    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                    int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+static void newviewGTRGAMMA_GAPPED_SAVE(int tipCase,
+                                        double *x1_start, double *x2_start, double *x3_start,
+                                        double *EV, double *tipVector,
+                                        int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                        const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                        unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
+                                        double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn);
+
+static void newviewGTRGAMMA(int tipCase,
+                            double *x1_start, double *x2_start, double *x3_start,
+                            double *EV, double *tipVector,
+                            int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                            const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling
+                            );
+
+static void newviewGTRCAT( int tipCase,  double *EV,  int *cptr,
+                           double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+                           int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                           int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling);
+
+
+static void newviewGTRCAT_SAVE( int tipCase,  double *EV,  int *cptr,
+                                double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+                                int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats);
+
+static void newviewGTRGAMMAPROT_GAPPED_SAVE(int tipCase,
+                                            double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                                            int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                            int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                            unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,  
+                                            double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
+                                            );
+
+static void newviewGTRGAMMAPROT(int tipCase,
+                                double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                                int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling);
+
+static void newviewGTRCATPROT(int tipCase, double *extEV,
+                              int *cptr,
+                              double *x1, double *x2, double *x3, double *tipVector,
+                              int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                              int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling);
+
+static void newviewGTRCATPROT_SAVE(int tipCase, double *extEV,
+                                   int *cptr,
+                                   double *x1, double *x2, double *x3, double *tipVector,
+                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                   int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                   unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                   double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats);
+
+#endif
+#if (defined(__AVX) || defined(__SSE3))
+static void newviewGTRCAT_BINARY( int tipCase,  double *EV,  int *cptr,
+                                  double *x1_start,  double *x2_start,  double *x3_start,  double *tipVector,
+                                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                  int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+static void newviewGTRGAMMA_BINARY(int tipCase,
+                                   double *x1_start, double *x2_start, double *x3_start,
+                                   double *EV, double *tipVector,
+                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                   const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+#endif
+
+/* required to compute the absolute values of double precision numbers with SSE3 */
+
+PLL_ALIGN_BEGIN const union PLL_ALIGN_END
+{
+  uint64_t i[2];
+  __m128d m;
+} absMask = {{0x7fffffffffffffffULL , 0x7fffffffffffffffULL }};
+
+
+
+#endif
+
+static int pllGetTransitionMatrixNormal (pllInstance * tr, partitionList * pr, nodeptr p, int model, int rate, double * outBuffer);
+static int pllGetTransitionMatrixLG4 (partitionList * pr, nodeptr p, int model, double * outBuffer);
+
+extern const char binaryStateNames[2];  /**< @brief Alphabet of binary states */
+extern const char dnaStateNames[4];     /**< @brief DNA alphabet  */
+extern const char protStateNames[20];   /**< @brief Amino-acid alphabet */
+extern const unsigned int mask32[32];   /**< @brief Contains the first 32 powers of 2, i.e. 2^0 up to 2^31 */
+
+static void ascertainmentBiasSequence(unsigned char tip[32], int numStates)
+{ 
+  assert(numStates <= 32 && numStates > 1);
+
+  switch(numStates)
+    {
+    case 2:     
+      tip[0] = 1;
+      tip[1] = 2;
+      break;
+    case 4:
+      tip[0] = 1;
+      tip[1] = 2;
+      tip[2] = 4;
+      tip[3] = 8;
+      break;
+    default:
+      {
+	int 
+	  i;
+	for(i = 0; i < numStates; i++)
+	  {
+	    tip[i] = i;
+	    //printf("%c ", inverseMeaningPROT[i]);
+	  }
+	//printf("\n");
+      }
+      break;
+    }
+}
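+
+/* Editorial note (not part of the upstream sources): the 4-state case above uses the
+   one-bit-per-nucleotide encoding (A = 1, C = 2, G = 4, T = 8) that PLL uses to index
+   the tipVector lookup tables, so each dummy ascertainment site is an unambiguous
+   A, C, G or T; for data types other than binary and DNA the plain state indices
+   0 .. numStates-1 are used instead. */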
+
+static void newviewAscCat(int tipCase,
+			  double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+			  int *ex3, 
+			  const int n, double *left, double *right, 			    
+			  const int numStates)
+{
+  double
+    *le, *ri, *v, *vl, *vr,
+    ump_x1, ump_x2, x1px2;
+  
+  int 
+    i, l, j, scale;
+
+ 
+  unsigned char 
+    tip[32];
+
+  ascertainmentBiasSequence(tip, numStates);
+  
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	for (i = 0; i < n; i++)
+	  {
+	    le = &left[0];
+	    ri = &right[0];
+
+	    vl = &(tipVector[numStates * tip[i]]);
+	    vr = &(tipVector[numStates * tip[i]]);
+	    v  = &x3[numStates * i];
+
+	    for(l = 0; l < numStates; l++)
+	      v[l] = 0.0;
+
+	    for(l = 0; l < numStates; l++)
+	      {
+		ump_x1 = 0.0;
+		ump_x2 = 0.0;
+
+		for(j = 0; j < numStates; j++)
+		  {
+		    ump_x1 += vl[j] * le[l * numStates + j];
+		    ump_x2 += vr[j] * ri[l * numStates + j];
+		  }
+
+		x1px2 = ump_x1 * ump_x2;
+
+		for(j = 0; j < numStates; j++)
+		  v[j] += x1px2 * extEV[l * numStates + j];
+	      }	    
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+	for (i = 0; i < n; i++)
+	  {
+	    le = &left[0];
+	    ri = &right[0];
+
+	    vl = &(tipVector[numStates * tip[i]]);
+	    vr = &x2[numStates * i];
+	    v  = &x3[numStates * i];
+
+	    for(l = 0; l < numStates; l++)
+	      v[l] = 0.0;
+
+	    for(l = 0; l < numStates; l++)
+	      {
+		ump_x1 = 0.0;
+		ump_x2 = 0.0;
+
+		for(j = 0; j < numStates; j++)
+		  {
+		    ump_x1 += vl[j] * le[l * numStates + j];
+		    ump_x2 += vr[j] * ri[l * numStates + j];
+		  }
+
+		x1px2 = ump_x1 * ump_x2;
+
+		for(j = 0; j < numStates; j++)
+		  v[j] += x1px2 * extEV[l * numStates + j];
+	      }
+
+	    scale = 1;
+	    for(l = 0; scale && (l < numStates); l++)
+	      scale = ((v[l] < PLL_MINLIKELIHOOD) && (v[l] > PLL_MINUSMINLIKELIHOOD));	    
+
+	    if(scale)
+	      {
+		for(l = 0; l < numStates; l++)
+		  v[l] *= PLL_TWOTOTHE256;
+			
+		ex3[i]  += 1;	      
+	      }
+	  }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+	{
+	  le = &left[0];
+	  ri = &right[0];
+
+	  vl = &x1[numStates * i];
+	  vr = &x2[numStates * i];
+	  v = &x3[numStates * i];
+
+	  for(l = 0; l < numStates; l++)
+	    v[l] = 0.0;
+
+	  for(l = 0; l < numStates; l++)
+	    {
+	      ump_x1 = 0.0;
+	      ump_x2 = 0.0;
+
+	      for(j = 0; j < numStates; j++)
+		{
+		  ump_x1 += vl[j] * le[l * numStates + j];
+		  ump_x2 += vr[j] * ri[l * numStates + j];
+		}
+
+	      x1px2 =  ump_x1 * ump_x2;
+
+	      for(j = 0; j < numStates; j++)
+		v[j] += x1px2 * extEV[l * numStates + j];
+	    }
+
+	   scale = 1;
+	   for(l = 0; scale && (l < numStates); l++)
+	     scale = ((v[l] < PLL_MINLIKELIHOOD) && (v[l] > PLL_MINUSMINLIKELIHOOD));
+	  
+	   if(scale)
+	     {
+	       for(l = 0; l < numStates; l++)
+		 v[l] *= PLL_TWOTOTHE256;
+	      
+	       ex3[i]  += 1;	     
+	     }
+	}
+      break;
+    default:
+      assert(0);
+    }
+  
+ 
+
+}
+
+
+static void newviewAscGamma(int tipCase,
+			    double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+			    int *ex3, 
+			    const int n, double *left, double *right, 			    
+			    const int numStates)
+{
+  
+  int  
+    i, j, l, k, scale;
+  
+  const int 
+    statesSquare = numStates * numStates,
+    gammaStates = 4 * numStates;
+
+  double 
+    *vl, *vr, al, ar, *v, x1px2;
+
+  unsigned char 
+    tip[32];
+
+  ascertainmentBiasSequence(tip, numStates);
+  
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	for(i = 0; i < n; i++)
+	  {
+	    for(k = 0; k < 4; k++)
+	      {
+		vl = &(tipVector[numStates * tip[i]]);
+		vr = &(tipVector[numStates * tip[i]]);
+		v =  &(x3[gammaStates * i + numStates * k]);
+
+		for(l = 0; l < numStates; l++)
+		  v[l] = 0;
+
+		for(l = 0; l < numStates; l++)
+		  {
+		    al = 0.0;
+		    ar = 0.0;
+		    for(j = 0; j < numStates; j++)
+		      {
+			al += vl[j] * left[k * statesSquare + l * numStates + j];
+			ar += vr[j] * right[k * statesSquare + l * numStates + j];
+		      }
+
+		    x1px2 = al * ar;
+		    for(j = 0; j < numStates; j++)
+		      v[j] += x1px2 * extEV[numStates * l + j];
+		  }
+	      }	    
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+	for (i = 0; i < n; i++)
+	  {
+	    for(k = 0; k < 4; k++)
+	      {
+		vl = &(tipVector[numStates * tip[i]]);
+		vr = &(x2[gammaStates * i + numStates * k]);
+		v =  &(x3[gammaStates * i + numStates * k]);
+
+		for(l = 0; l < numStates; l++)
+		  v[l] = 0;
+
+		for(l = 0; l < numStates; l++)
+		  {
+		    al = 0.0;
+		    ar = 0.0;
+		    for(j = 0; j < numStates; j++)
+		      {
+			al += vl[j] * left[k * statesSquare + l * numStates + j];
+			ar += vr[j] * right[k * statesSquare + l * numStates + j];
+		      }
+
+		    x1px2 = al * ar;
+		    for(j = 0; j < numStates; j++)
+		      v[j] += x1px2 * extEV[numStates * l + j];
+		  }
+	      }
+	   
+	    v = &x3[gammaStates * i];
+	    scale = 1;
+	    for(l = 0; scale && (l < gammaStates); l++)
+	      scale = (PLL_ABS(v[l]) < PLL_MINLIKELIHOOD);
+
+	    if(scale)
+	      {		
+		for(l = 0; l < gammaStates; l++)
+		  v[l] *= PLL_TWOTOTHE256;
+		
+		ex3[i]  += 1;	      
+	      }
+	  }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+       {
+	 for(k = 0; k < 4; k++)
+	   {
+	     vl = &(x1[gammaStates * i + numStates * k]);
+	     vr = &(x2[gammaStates * i + numStates * k]);
+	     v =  &(x3[gammaStates * i + numStates * k]);
+
+	     for(l = 0; l < numStates; l++)
+	       v[l] = 0;
+
+	     for(l = 0; l < numStates; l++)
+	       {
+		 al = 0.0;
+		 ar = 0.0;
+		 for(j = 0; j < numStates; j++)
+		   {
+		     al += vl[j] * left[k * statesSquare + l * numStates + j];
+		     ar += vr[j] * right[k * statesSquare + l * numStates + j];
+		   }
+
+		 x1px2 = al * ar;
+		 for(j = 0; j < numStates; j++)
+		   v[j] += x1px2 * extEV[numStates * l + j];
+	       }
+	   }
+	 
+	 v = &(x3[gammaStates * i]);
+	 scale = 1;
+	 for(l = 0; scale && (l < gammaStates); l++)
+	   scale = ((PLL_ABS(v[l]) <  PLL_MINLIKELIHOOD));
+
+	 if(scale)
+	   {	    
+	     for(l = 0; l < gammaStates; l++)
+	       v[l] *= PLL_TWOTOTHE256;
+	     
+	     ex3[i]  += 1;	    
+	   }
+       }
+      break;
+    default:
+      assert(0);
+    }  
+}
+
+
+/* generic function for computing the P matrices, for computing the conditional likelihood at a node p, given child nodes q and r 
+   we compute P(z1) and P(z2) here */
+
+/** @brief Computes two P matrices for two edges.
+
+    Generic function for computing the P matrices of two nodes based on their edges. This is used to 
+    (later) compute the conditional likelihood at a node \a p which has two descendants \a q and \a r, 
+    which in turn have the edges \a z1 and \a z2 that connect them with \a p. Given those edges, we
+    compute two P matrices \a P(z1) and \a P(z2) which are stored in the arrays \a left and \a right.
+ 
+    The following value is computed here: 
+    \f[
+     EI\cdot exp( EIGN \cdot z)
+     \f]
+     to fill up the P matrix.
+     
+    @param z1    Branch length leading to left descendant node (let's call it \a q)
+    @param z2    Branch length leading to right descendant node (let's call it \a r)
+    @param rptr  Array of values for rate categories
+    @param EI    Inverse eigenvectors of Q-matrix
+    @param EIGN  Eigenvalues of Q-matrix
+    @param numberOfCategories How many rate heterogeneity categories we have, depending on GAMMA and CAT
+    @param left  Where to store the left P matrix (for node \a q)
+    @param right Where to store the right P matrix (for node \a r)
+    @param saveMem If set to \b PLL_TRUE, memory saving technique is enabled
+    @param maxCat Maximum number of rate categories
+    @param states Number of states for the particular data (4 for DNA or 20 for AA)
+*/
+static void 
+makeP(double z1, double z2, double *rptr, double *EI,  double *EIGN, int numberOfCategories, double *left, double *right, pllBoolean saveMem, int maxCat, const int states)
+{
+  int  i, j, k, statesSquare = states * states;
+
+  /* assign some space for pre-computing and later re-using functions */
+
+  double 
+    *lz1 = (double*)rax_malloc(sizeof(double) * states),
+    *lz2 = (double*)rax_malloc(sizeof(double) * states),
+    *d1 = (double*)rax_malloc(sizeof(double) * states),
+    *d2 = (double*)rax_malloc(sizeof(double) * states);
+
+  /* multiply branch lengths with eigenvalues */
+
+  for(i = 1; i < states; i++)
+  {
+    lz1[i] = EIGN[i] * z1;
+    lz2[i] = EIGN[i] * z2;
+  }
+
+
+  /* loop over the number of rate categories, this will be 4 for the GAMMA model and 
+     variable for the CAT model */
+
+  for(i = 0; i < numberOfCategories; i++)
+  {
+    /* exponentiate the rate multiplied by the branch */
+
+    for(j = 1; j < states; j++)
+    {
+      d1[j] = exp(rptr[i] * lz1[j]);
+      d2[j] = exp(rptr[i] * lz2[j]);
+
+    }
+
+    /* now fill the P matrices for the two branch length values */
+
+    for(j = 0; j < states; j++)
+    {
+      /* left and right are pre-allocated arrays */
+
+      left[statesSquare * i  + states * j] = 1.0;
+      right[statesSquare * i + states * j] = 1.0;         
+
+      for(k = 1; k < states; k++)
+      {
+        left[statesSquare * i + states * j + k]  = d1[k] * EI[states * j + k];
+        right[statesSquare * i + states * j + k] = d2[k] * EI[states * j + k];
+      }
+    }
+  }
+
+
+  /* if memory saving is enabled and we are using CAT we need to do one additional P matrix 
+     calculation for a rate of 1.0 to compute the entries of a column/tree site comprising only gaps */
+
+
+  if(saveMem)
+  {
+    i = maxCat;
+
+    for(j = 1; j < states; j++)
+    {
+      d1[j] = exp (lz1[j]);
+      d2[j] = exp (lz2[j]);
+    }
+
+    for(j = 0; j < states; j++)
+    {
+      left[statesSquare * i  + states * j] = 1.0;
+      right[statesSquare * i + states * j] = 1.0;
+
+      for(k = 1; k < states; k++)
+      {
+        left[statesSquare * i + states * j + k]  = d1[k] * EI[states * j + k];
+        right[statesSquare * i + states * j + k] = d2[k] * EI[states * j + k];
+      }
+    }
+  }
+
+  /* free the temporary buffers */
+
+  rax_free(lz1);
+  rax_free(lz2);
+  rax_free(d1);
+  rax_free(d2);
+}
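+
+/* Editorial sketch (not part of the upstream sources): written out with indices, for
+   rate category i and s = states the loops above store
+
+     left[s*s*i + s*j + k] = exp(rptr[i] * EIGN[k] * z1) * EI[s*j + k]   for k > 0
+     left[s*s*i + s*j + 0] = 1.0
+
+   (and the same for right with z2), i.e. the half-product EI * diag(exp(r_i * EIGN * z));
+   the multiplication with the eigenvector matrix that completes P(z) is folded into the
+   newview*() functions via extEV. */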
+
+
+/** @brief Compute the transition probability matrix for a given branch
+
+    Computes the transition probability matrix for the branch \a p->z and partition \a model given the
+    PLL instance \a tr and list of partitions \a pr. The result is stored in \a outBuffer, which must
+    be of sufficient size, i.e. states * states * (numberOfRateCategories + 1) * sizeof(double).
+
+    @param tr  PLL instance
+    @param pr  List of partitions
+    @param model  Partition index for which to take the branch length
+    @param p  Node adjacent to the edge for which we want to compute the transition probability matrix
+    @param rate  Index of the rate category (CAT) or discrete GAMMA rate for which to compute the matrix
+    @param outBuffer Output buffer where to store the transition probability matrix
+
+    @return
+      \b PLL_TRUE in case of success, otherwise \b PLL_FALSE
+*/
+int pllGetTransitionMatrix (pllInstance * tr, partitionList * pr, nodeptr p, int model, int rate, double * outBuffer)
+{
+  if (tr->rateHetModel == PLL_CAT)
+   {
+     if (rate >= pr->partitionData[model]->numberOfCategories) return (PLL_FALSE);
+   }
+  else
+   {
+     if (rate >= 4) return (PLL_FALSE);
+   }
+
+  if (pr->partitionData[model]->dataType == PLL_AA_DATA &&
+		  (pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X))
+    return (pllGetTransitionMatrixLG4 (pr, p, model, outBuffer));
+    
+    
+  return (pllGetTransitionMatrixNormal (tr, pr, p, model, rate, outBuffer));
+}
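+
+/* Editorial sketch (not part of the upstream sources): a hypothetical call to the
+   function above for partition 0 of an already initialised instance; tr, pr and p
+   are assumed to exist, and the output buffer is sized as documented above.
+
+     int      states = pr->partitionData[0]->states;          // 4 for DNA
+     double * P      = (double *) rax_malloc ((size_t) (4 + 1) * states * states * sizeof (double));
+     if (pllGetTransitionMatrix (tr, pr, p, 0, 0, P) == PLL_TRUE)   // GAMMA rate category 0
+       printf ("P[0][1] = %f\n", P[0 * states + 1]);                // row 0, column 1 of P(z)
+     rax_free (P);
+   */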
+
+
+/* TODO: Fix this function according to pllGetTransitionMatrixNormal */
+static int pllGetTransitionMatrixLG4 (partitionList * pr, nodeptr p, int model, double * outBuffer)
+{
+  int
+    i, j, k,
+    states = pr->partitionData[model]->states,
+    numberOfCategories = 4;
+  double
+    d[64],
+    *  rptr = pr->partitionData[model]->gammaRates,
+    ** EI   = pr->partitionData[model]->EI_LG4,
+    ** EIGN = pr->partitionData[model]->EIGN_LG4;
+
+  assert (states == 20);
+
+  for (i = 0; i < numberOfCategories; ++i)
+   {
+     for (j = 1; j < states; ++j)
+      {
+        d[j] = exp(rptr[i] * EIGN[i][j] * p->z[model]);
+      }
+     for (j = 0; j < states; ++ j)
+      {
+        outBuffer[states * states * i + states * j] = 1.0;
+        for (k = 1; k < states; ++k) 
+         {
+           outBuffer[states * states * i + states * j + k] = d[k] * EI[i][states * j + k];
+         }
+      }
+   }
+  return (PLL_TRUE);
+}
+
+static int pllGetTransitionMatrixNormal (pllInstance * tr, partitionList * pr, nodeptr p, int model, int rate, double * outBuffer)
+{
+  int 
+    i, j, k,
+    /* numberOfCategories, */
+    states = pr->partitionData[model]->states;
+  double
+    * d = (double *)rax_malloc(sizeof(double) * states),
+    * rptr,
+    * EI   = pr->partitionData[model]->EI,
+    * EIGN = pr->partitionData[model]->EIGN,
+    * EV = pr->partitionData[model]->EV;
+  
+  double lz = (p->z[model] > PLL_ZMIN) ? log(p->z[model]) : log(PLL_ZMIN);                        
+
+  if (tr->rateHetModel == PLL_CAT)
+   {
+     rptr               = pr->partitionData[model]->perSiteRates;
+     /* numberOfCategories = pr->partitionData[model]->numberOfCategories; */
+   }
+  else
+   {
+     rptr               = pr->partitionData[model]->gammaRates;
+     /* numberOfCategories = 4; */
+   }
+
+  for (i = 0; i < states * states; ++ i) outBuffer[i] = 0;
+
+  d[0] = 1.0;
+  for (j = 1; j < states; ++ j)
+   {
+     d[j] = exp(rptr[rate] * EIGN[j] * lz);
+   }
+
+  for (i = 0; i < states; ++ i)
+   {
+     for (j = 0; j < states; ++ j)
+      {
+        for (k = 0; k < states; ++ k)
+         {
+           outBuffer[states * i + j] += (d[k] * EI[states * i + k] * EV[states * j + k]);
+         }
+      }
+   }
+
+  assert (!tr->saveMemory);
+  // TODO: Fix the following snippet
+  //if (tr->saveMemory)
+  // {
+  //   i = tr->maxCategories;
+  //   
+  //   for (j = 1; j < states; ++j)
+  //    {
+  //      d[j] = EXP(EIGN[j] * p->z[model]);
+  //    }
+
+  //   for (j = 0; j < states; ++j)
+  //    {
+  //      outBuffer[states * states * i + states * j] = 1.0;
+  //      for (k = 1; k < states; ++k)
+  //       {
+  //         outBuffer[states * states * i + states * j + k] = d[k] * EI[states * j + k];
+  //       }
+  //    }
+  // }
+
+  rax_free(d);
+
+  return (PLL_TRUE);
+}
+
+
+/** @brief Compute two P matrices for two edges for the LG4 model
+    
+    Computing the P matrices of two nodes based on their edges for the LG4 model. This is used to 
+    (later) compute the conditional likelihood at a node \a p which has two descendants \a q and \a r, 
+    which in turn have the edges \a z1 and \a z2 that connect them with \a p. Given those edges, we
+    compute two P matrices \a P(z1) and \a P(z2) which are stored in the arrays \a left and \a right.
+
+    @param z1
+      Branch length leading to left descendant node (let's call it \a q)
+     
+    @param z2
+      Branch length leading to right descendant node (let's call it \a r)
+
+    @param rptr
+      Array of values for rate categories
+
+    @param EI
+      Inverse eigenvectors of 4 Q-matrices
+     
+    @param EIGN
+      Eigenvalues of the 4 Q-matrices
+
+    @param numberOfCategories
+      How many rate heterogeneity categories we have, depending on GAMMA and CAT
+     
+    @param left
+      Where to store the left P matrix (for node \a q)
+     
+    @param right
+      Where to store the right P matrix (for node \a r)
+
+    @param numStates
+      Number of states for the particular data (4 for DNA or 20 for AA)
+
+    @todo
+      Present the maths here as in ::makeP
+
+*/
+static void makeP_FlexLG4(double z1, double z2, double *rptr, double *EI[4],  double *EIGN[4], int numberOfCategories, double *left, double *right, const int numStates)
+{
+  int 
+    i,
+    j,
+    k;
+  
+  const int
+    statesSquare = numStates * numStates;
+
+  double    
+    d1[64],  
+    d2[64];
+
+  assert(numStates <= 64);
+       
+  for(i = 0; i < numberOfCategories; i++)
+    {
+      for(j = 1; j < numStates; j++)
+        {
+          d1[j] = exp (rptr[i] * EIGN[i][j] * z1);
+          d2[j] = exp (rptr[i] * EIGN[i][j] * z2);
+        }
+
+      for(j = 0; j < numStates; j++)
+        {
+          left[statesSquare * i  + numStates * j] = 1.0;
+          right[statesSquare * i + numStates * j] = 1.0;
+
+          for(k = 1; k < numStates; k++)
+            {
+              left[statesSquare * i + numStates * j + k]  = d1[k] * EI[i][numStates * j + k];
+              right[statesSquare * i + numStates * j + k] = d2[k] * EI[i][numStates * j + k];
+            }
+        }
+    }  
+}
+
+#if (!defined(__AVX) && !defined(__SSE3))
+
+/** @brief Computation of conditional likelihood arrays for CAT
+ 
+    This is a generic, slow but readable function implementation for computing the 
+     conditional likelihood arrays at p, given child nodes q and r using the CAT
+     mode of rate heterogeneity. Depending on whether \a q, resp. \a r, are tips or internal
+     nodes (indicated by \a tipCase) the conditional likelihoods are computed based on
+     \a x1 if \a q is an inner node or \a tipX1 if it is a tip, resp. \a x2 if \a r
+     is an inner node or \a tipX2 if it is a tip. Output array \a ex3 stores the
+     number of times the likelihood of each site for each internal node has been scaled.
+     The conditional likelihood vectors for any possible base-pair (which is useful when
+     \a q or \a r are tips) have already been precomputed from the eigenvalues of the Q
+     matrix in the array \a tipVector. In case the conditional likelihood for a particular
+     site is very small in terms of a floating point number, then it is multiplied by a
+     very large number (scaling), and the number of times it has been scaled (per node) is
+     stored in the array \a ex3, if \a fastScaling is set to \b PLL_FALSE. Otherwise, the
+     total number of scalings for all sites and all nodes is stored in a single variable
+     \a scalerIncrement.
+
+    @param tipCase
+      Can be either \b PLL_TIP_TIP, or \b PLL_TIP_INNER or \b PLL_INNER_INNER, and describes the
+      descendants of the node for which we currently compute the condition likelihood
+      vector, i.e. whether they are both tips (leaves), or one is tip and the other
+      an inner node, or both are inner nodes.
+
+    @param extEV
+      Eigenvectors of Q matrix
+      
+    @param cptr
+      Array where the rate for each site in the compressed partition alignment is stored
+
+    @param x1
+      Conditional likelihood vectors of the first child node, in case it is an internal node
+
+    @param x2
+      Conditional likelihood vectors of the second child node, in case it is an internal node
+
+    @param x3
+      Pointer to where the computed conditional likelihood vector of node \a p will be stored
+
+    @param tipVector
+      Vector containing sums of left eigenvectors for likelihood computation at tips.
+
+    @param ex3
+      Pointer to an array whose elements correspond to the number of times the likelihood of
+      a particular site of a particular internal node is scaled. Those elements are incremented
+      at every scaling operation and only if \a fastScaling flag is set to \b PLL_FALSE. This 
+      array will be used later when evaluating the likelihood of the whole tree.
+
+    @param tipX1
+      Pointer to the alignment data (sequence) of first child node, in case it is a tip
+
+    @param tipX2
+      Pointer to the alignment data (sequence) of second child node, in case it is a tip
+
+    @param n
+      Number of sites for which we are doing the evaluation. For the single-thread version this is the number of sites in the
+      current partition, for multi-threads this is the number of sites assigned to the running thread from the current partition.
+
+    @param left
+      Pointer to the P matrix of the left child
+
+    @param right
+      Pointer to the P matrix of the right child
+
+    @param wgt
+      Array of weights for each site
+
+    @param scalerIncrement
+      Where to store the number of scalings carried out in case \a fastScaling is set to \b PLL_TRUE.
+
+    @param fastScaling
+      If set to \b PLL_TRUE, only the total number of scalings for all sites of the partition will be
+      stored in \a scalerIncrement, otherwise per-site scalings are stored in the array \a ex3. 
+
+    @param states
+      Number of states for the particular data (4 for DNA or 20 for AA)
+ */
+static void newviewCAT_FLEX(int tipCase, double *extEV,
+                            int *cptr,
+                            double *x1, double *x2, double *x3, double *tipVector,
+                            int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                            int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling, const int states)
+{
+  double
+    *le, 
+    *ri, 
+    *v, 
+    *vl, 
+    *vr,
+    ump_x1, 
+    ump_x2, 
+    x1px2;
+
+  int 
+    i, 
+    l, 
+    j, 
+    scale, 
+    addScale = 0;
+
+  const int 
+    statesSquare = states * states;
+
+
+  /* here we switch over the different cases for efficiency, but also because 
+     each case accesses different data types.
+
+     We consider three cases: either q and r are both tips, q or r are tips, and q and r are inner 
+     nodes.
+     */
+
+
+  switch(tipCase)
+  {
+
+    /* both child nodes of p, for which we want to update the conditional likelihood, are tips */
+    case PLL_TIP_TIP:     
+      /* loop over sites */
+      for (i = 0; i < n; i++)
+      {
+        /* set a pointer to the P-Matrices for the rate category of this site */
+        le = &left[cptr[i] * statesSquare];
+        ri = &right[cptr[i] * statesSquare];
+
+        /* pointers to the likelihood entries of the tips q (vl) and r (vr) 
+           We will do reading accesses to these values only.
+           */
+        vl = &(tipVector[states * tipX1[i]]);
+        vr = &(tipVector[states * tipX2[i]]);
+
+        /* address of the conditional likelihood array entries at site i. This is 
+           a writing access to v */
+        v  = &x3[states * i];
+
+        /* initialize v */
+        for(l = 0; l < states; l++)
+          v[l] = 0.0;
+
+        /* loop over states to compute the cond likelihoods at p (v) */
+
+        for(l = 0; l < states; l++)
+        {             
+          ump_x1 = 0.0;
+          ump_x2 = 0.0;
+
+          /* le and ri are the P-matrices */
+
+          for(j = 0; j < states; j++)
+          {
+            ump_x1 += vl[j] * le[l * states + j];
+            ump_x2 += vr[j] * ri[l * states + j];
+          }
+
+          x1px2 = ump_x1 * ump_x2;
+
+          /* multiply with matrix of eigenvectors extEV */
+
+          for(j = 0; j < states; j++)
+            v[j] += x1px2 * extEV[l * states + j];
+        }          
+      }    
+      break;
+    case PLL_TIP_INNER:      
+
+      /* same as above, only that now vl is a tip and vr is the conditional probability vector 
+         at an inner node. Note that, if we have the case that either q or r is a tip, the 
+         nodes will be flipped to ensure that tipX1 always points to the sequence at the tip.
+         */
+
+      for (i = 0; i < n; i++)
+      {
+        le = &left[cptr[i] * statesSquare];
+        ri = &right[cptr[i] * statesSquare];
+
+        /* access tip vector lookup table */
+        vl = &(tipVector[states * tipX1[i]]);
+
+        /* access conditional likelihood arrays */
+        /* again, vl and vr are reading accesses, while v is a writing access */
+        vr = &x2[states * i];
+        v  = &x3[states * i];
+
+        /* same as in the loop above */
+
+        for(l = 0; l < states; l++)
+          v[l] = 0.0;
+
+        for(l = 0; l < states; l++)
+        {
+          ump_x1 = 0.0;
+          ump_x2 = 0.0;
+
+          for(j = 0; j < states; j++)
+          {
+            ump_x1 += vl[j] * le[l * states + j];
+            ump_x2 += vr[j] * ri[l * states + j];
+          }
+
+          x1px2 = ump_x1 * ump_x2;
+
+          for(j = 0; j < states; j++)
+            v[j] += x1px2 * extEV[l * states + j];
+        }
+
+        /* now let's check for numerical scaling. 
+           The maths in RAxML are a bit non-standard to avoid/economize on arithmetic operations 
+           at the virtual root and for branch length optimization and hence values stored 
+           in the conditional likelihood vectors can become negative.
+           Below we check if all absolute values stored at position i of v are smaller 
+           than a pre-defined value in pll.h. If they are all smaller we can then safely 
+           multiply them by a large, constant number PLL_TWOTOTHE256 (without numerical overflow) 
+           that is also specified in pll.h */
+
+        scale = 1;
+        for(l = 0; scale && (l < states); l++)
+          scale = ((v[l] < PLL_MINLIKELIHOOD) && (v[l] > PLL_MINUSMINLIKELIHOOD));         
+
+        if(scale)
+        {
+          for(l = 0; l < states; l++)
+            v[l] *= PLL_TWOTOTHE256;
+
+          /* if we have scaled the entries to prevent underflow, we need to keep track of how many scaling 
+             multiplications we did per node such as to undo them at the virtual root, e.g., in 
+             evaluateGeneric() 
+             Note here that, if we scaled the site, we need to increment the scaling counter by the weight, i.e., 
+             the number of sites this potentially compressed pattern represents! */ 
+
+          if(!fastScaling)
+            ex3[i] += 1;
+          else
+            addScale += wgt[i];   
+          
+        }
+      }   
+      break;
+    case PLL_INNER_INNER:
+
+      /* same as above, only that the two child nodes q and r are now inner nodes */
+
+      for(i = 0; i < n; i++)
+      {
+        le = &left[cptr[i] * statesSquare];
+        ri = &right[cptr[i] * statesSquare];
+
+        /* index conditional likelihood vectors of inner nodes */
+
+        vl = &x1[states * i];
+        vr = &x2[states * i];
+        v = &x3[states * i];
+
+        for(l = 0; l < states; l++)
+          v[l] = 0.0;
+
+        for(l = 0; l < states; l++)
+        {
+          ump_x1 = 0.0;
+          ump_x2 = 0.0;
+
+          for(j = 0; j < states; j++)
+          {
+            ump_x1 += vl[j] * le[l * states + j];
+            ump_x2 += vr[j] * ri[l * states + j];
+          }
+
+          x1px2 =  ump_x1 * ump_x2;
+
+          for(j = 0; j < states; j++)
+            v[j] += x1px2 * extEV[l * states + j];            
+        }
+
+        scale = 1;
+        for(l = 0; scale && (l < states); l++)
+          scale = ((v[l] < PLL_MINLIKELIHOOD) && (v[l] > PLL_MINUSMINLIKELIHOOD));
+
+        if(scale)
+        {
+          for(l = 0; l < states; l++)
+            v[l] *= PLL_TWOTOTHE256;
+          
+          if(!fastScaling)
+            ex3[i] += 1;
+          else
+            addScale += wgt[i];    
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  /* increment the scaling counter by the additional scalings done at node p */
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
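+
+/* Editorial note (not part of the upstream sources): the scaling bookkeeping above works
+   per site. Once every entry of a site's vector v falls below PLL_MINLIKELIHOOD in
+   absolute value, the vector is multiplied by PLL_TWOTOTHE256 (2^256) and the event is
+   recorded either per node and site (ex3[i] += 1) or, with fastScaling, as the aggregate
+   addScale += wgt[i]. The evaluate functions later undo this at the virtual root by
+   adding roughly (number of scalings) * log(PLL_MINLIKELIHOOD), i.e. -256 * log(2) per
+   recorded scaling (assuming PLL_MINLIKELIHOOD is defined as 1/PLL_TWOTOTHE256), to the
+   per-site log-likelihood. */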
+
+/** @brief Computation of conditional likelihood arrays for \b GAMMA
+ 
+    This is a generic, slow but readable function implementation for computing the 
+     conditional likelihood arrays at \a p, given child nodes \a q and \a r using the \b GAMMA
+     model of rate heterogeneity. Depending on whether \a q, resp. \a r, are tips or internal
+     nodes (indicated by \a tipCase) the conditional likelihoods are computed based on
+     \a x1 if \a q is an inner node or \a tipX1 if it is a tip, resp. \a x2 if \a r
+     is an inner node or \a tipX2 if it is a tip. Output array \a ex3 stores the
+     number of times the likelihood of each site for each internal node has been scaled.
+     The conditional likelihood vectors for any possible base-pair (which is useful when
+     \a q or \a r are tips) have already been precomputed from the eigenvalues of the Q
+     matrix in the array \a tipVector. In case the conditional likelihood for a particular
+     site is very small in terms of a floating point number, then it is multiplied by a
+     very large number (scaling), and the number of times it has been scaled (per node) is
+     stored in the array \a ex3, if \a fastScaling is set to \b PLL_FALSE. Otherwise, the
+     total number of scalings for all sites and all nodes is stored in a single variable
+     \a scalerIncrement.
+
+    @param tipCase
+      Can be either \b PLL_TIP_TIP, or \b PLL_TIP_INNER or \b PLL_INNER_INNER, and describes the
+      descendants of the node for which we currently compute the condition likelihood
+      vector, i.e. whether they are both tips (leaves), or one is tip and the other
+      an inner node, or both are inner nodes.
+
+    @param x1
+      Conditional likelihood vectors of the first child node, in case it is an internal node
+
+    @param x2
+      Conditional likelihood vectors of the second child node, in case it is an internal node
+
+    @param x3
+      Pointer to where the computed conditional likelihood vector of node \a p will be stored
+
+    @param extEV
+      Eigenvectors of Q matrix
+
+    @param tipVector
+      Vector containing sums of left eigenvectors for likelihood computation at tips.
+
+    @param ex3
+      Pointer to an array whose elements correspond to the number of times the likelihood of
+      a particular site of a particular internal node is scaled. Those elements are incremented
+      at every scaling operation and only if \a fastScaling flag is set to \b PLL_FALSE. This 
+      array will be used later when evaluating the likelihood of the whole tree.
+
+    @param tipX1
+      Pointer to the alignment data (sequence) of first child node, in case it is a tip
+
+    @param tipX2
+      Pointer to the alignment data (sequence) of second child node, in case it is a tip
+
+    @param n
+      Number of sites to be processed
+
+    @param left
+      Pointer to the P matrix of the left child
+
+    @param right
+      Pointer to the P matrix of the right child
+
+    @param wgt
+      Array of weights for each site
+
+    @param scalerIncrement
+      Where to store the number of scalings carried out in case \a fastScaling is set to \b PLL_TRUE.
+
+    @param fastScaling
+      If set to \b PLL_TRUE, only the total number of scalings for all sites of the partition will be
+      stored in \a scalerIncrement, otherwise per-site scalings are stored in the array \a ex3. 
+
+    @param states
+      Number of states for the particular data (4 for DNA or 20 for AA)
+
+    @param maxStateValue
+      Number of all possible character states including degenerate characters, i.e. 16 for DNA and 23 for AA
+ */
+static void newviewGAMMA_FLEX(int tipCase,
+                              double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                              int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                              int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling, const int states, const int maxStateValue)
+{
+  double  
+    *uX1, 
+    *uX2, 
+    *v, 
+    x1px2, 
+    *vl, 
+    *vr, 
+    al, 
+    ar;
+
+  int  
+    i, 
+    j, 
+    l, 
+    k, 
+    scale, 
+    addScale = 0;
+
+  const int     
+    statesSquare = states * states,
+                 span = states * 4,
+                 /* this is required for doing some pre-computations that help to save 
+                    numerical operations. What we are actually computing here are additional lookup tables 
+                    for each possible state a certain data-type can assume.
+                    For DNA with ambiguity coding this is 15, for proteins this is 22 or 23, since there 
+                    also exist one or two ambiguity codes for protein data.
+                    Essentially this is very similar to the tip vectors which we also use as lookup tables */
+                 precomputeLength = maxStateValue * span;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        /* allocate pre-compute memory space */
+
+        double 
+          *umpX1 = (double*)rax_malloc(sizeof(double) * precomputeLength),
+          *umpX2 = (double*)rax_malloc(sizeof(double) * precomputeLength);
+
+        /* multiply all possible tip state vectors with the respective P-matrices 
+        */
+
+        for(i = 0; i < maxStateValue; i++)
+        {
+          v = &(tipVector[states * i]);
+
+          for(k = 0; k < span; k++)
+          {
+
+            umpX1[span * i + k] = 0.0;
+            umpX2[span * i + k] = 0.0;
+
+            for(l = 0; l < states; l++)
+            {
+              umpX1[span * i + k] +=  v[l] *  left[k * states + l];
+              umpX2[span * i + k] +=  v[l] * right[k * states + l];
+            }
+
+          }
+        }
+
+        for(i = 0; i < n; i++)
+        {
+          /* access the precomputed arrays (pre-computed multiplication of conditional with the tip state) 
+          */
+
+          uX1 = &umpX1[span * tipX1[i]];
+          uX2 = &umpX2[span * tipX2[i]];
+
+          /* loop over discrete GAMMA rates */
+
+          for(j = 0; j < 4; j++)
+          {
+            /* the rest is the same as for CAT */
+            v = &x3[i * span + j * states];
+
+            for(k = 0; k < states; k++)
+              v[k] = 0.0;
+
+            for(k = 0; k < states; k++)
+            {              
+              x1px2 = uX1[j * states + k] * uX2[j * states + k];
+
+              for(l = 0; l < states; l++)                                                       
+                v[l] += x1px2 * extEV[states * k + l];               
+            }
+
+          }        
+        }
+
+        /* free precomputed vectors */
+
+        rax_free(umpX1);
+        rax_free(umpX2);
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        /* we do analogous pre-computations as above, with the only difference that we now do them 
+           only for one tip vector */
+
+        double 
+          *umpX1 = (double*)rax_malloc(sizeof(double) * precomputeLength),
+          *ump_x2 = (double*)rax_malloc(sizeof(double) * states);
+
+        /* precompute P and left tip vector product */
+
+        for(i = 0; i < maxStateValue; i++)
+        {
+          v = &(tipVector[states * i]);
+
+          for(k = 0; k < span; k++)
+          {
+
+            umpX1[span * i + k] = 0.0;
+
+            for(l = 0; l < states; l++)
+              umpX1[span * i + k] +=  v[l] * left[k * states + l];
+
+
+          }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+          /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */
+
+          uX1 = &umpX1[span * tipX1[i]];
+
+          /* loop over discrete GAMMA rates */
+
+          for(k = 0; k < 4; k++)
+          {
+            v = &(x2[span * i + k * states]);
+
+            for(l = 0; l < states; l++)
+            {
+              ump_x2[l] = 0.0;
+
+              for(j = 0; j < states; j++)
+                ump_x2[l] += v[j] * right[k * statesSquare + l * states + j];
+            }
+
+            v = &(x3[span * i + states * k]);
+
+            for(l = 0; l < states; l++)
+              v[l] = 0;
+
+            for(l = 0; l < states; l++)
+            {
+              x1px2 = uX1[k * states + l]  * ump_x2[l];
+              for(j = 0; j < states; j++)
+                v[j] += x1px2 * extEV[l * states  + j];
+            }
+          }
+
+          /* also do numerical scaling as above. Note that here we need to scale 
+             4 * 4 values for DNA or 4 * 20 values for protein data.
+             If they are ALL smaller than our threshold, we scale. Note that
+             this can cause numerical problems with GAMMA if the values generated 
+             by the four discrete GAMMA rates are too different.
+
+             For details, see: 
+
+             F. Izquierdo-Carrasco, S.A. Smith, A. Stamatakis: "Algorithms, Data Structures, and Numerics for Likelihood-based Phylogenetic Inference of Huge Trees"
+
+*/
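+
+          /* Worked illustration of the scaling rule (descriptive only; assuming, as in
+             RAxML, PLL_MINLIKELIHOOD == 1.0 / PLL_TWOTOTHE256, i.e. 2^-256 ~ 8.6e-78):
+             a site is scaled only when every one of its span entries has an absolute
+             value below that threshold. All entries are then multiplied by 2^256 and
+             one scaling event is recorded for the site (weighted by wgt[i] under fast
+             scaling), so that the log-likelihood computation can later correct for the
+             accumulated scaling events. */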
+
+
+          v = &x3[span * i];
+          scale = 1;
+          for(l = 0; scale && (l < span); l++)
+            scale = (PLL_ABS(v[l]) <  PLL_MINLIKELIHOOD);
+
+
+          if (scale)
+          {
+            for(l = 0; l < span; l++)
+              v[l] *= PLL_TWOTOTHE256;
+            
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];                   
+          }
+        }
+
+        rax_free(umpX1);
+        rax_free(ump_x2);
+      }
+      break;
+    case PLL_INNER_INNER:
+
+      /* same as above, without pre-computations */
+
+      for (i = 0; i < n; i++)
+      {
+        for(k = 0; k < 4; k++)
+        {
+          vl = &(x1[span * i + states * k]);
+          vr = &(x2[span * i + states * k]);
+          v =  &(x3[span * i + states * k]);
+
+
+          for(l = 0; l < states; l++)
+            v[l] = 0;
+
+
+          for(l = 0; l < states; l++)
+          {              
+
+            al = 0.0;
+            ar = 0.0;
+
+            for(j = 0; j < states; j++)
+            {
+              al += vl[j] * left[k * statesSquare + l * states + j];
+              ar += vr[j] * right[k * statesSquare + l * states + j];
+            }
+
+            x1px2 = al * ar;
+
+            for(j = 0; j < states; j++)
+              v[j] += x1px2 * extEV[states * l + j];
+
+          }
+        }
+
+        v = &(x3[span * i]);
+        scale = 1;
+        for(l = 0; scale && (l < span); l++)
+          scale = ((PLL_ABS(v[l]) <  PLL_MINLIKELIHOOD));
+
+        if(scale)
+        {  
+          for(l = 0; l < span; l++)
+            v[l] *= PLL_TWOTOTHE256;
+          
+          if(!fastScaling)
+            ex3[i] += 1;
+          else
+            addScale += wgt[i];           
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  /* as above, increment the global counter that counts scaling multiplications by the scaling multiplications 
+     carried out for computing the likelihood array at node p */
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
+
+/* Candidate for deletion */
+/*
+static void newviewGTRCAT( int tipCase,  double *EV,  int *cptr,
+                           double *x1_start,  double *x2_start,  double *x3_start,  double *tipVector,
+                           int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                           int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling)
+{
+  double
+    *le,
+    *ri,
+    *x1, *x2, *x3;
+  double
+    ump_x1, ump_x2, x1px2[4];
+  int i, j, k, scale, addScale = 0;
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+        for (i = 0; i < n; i++)
+          {
+            x1 = &(tipVector[4 * tipX1[i]]);
+            x2 = &(tipVector[4 * tipX2[i]]);
+            x3 = &x3_start[4 * i];
+
+            le =  &left[cptr[i] * 16];
+            ri =  &right[cptr[i] * 16];
+
+            for(j = 0; j < 4; j++)
+              {
+                ump_x1 = 0.0;
+                ump_x2 = 0.0;
+                for(k = 0; k < 4; k++)
+                  {
+                    ump_x1 += x1[k] * le[j * 4 + k];
+                    ump_x2 += x2[k] * ri[j * 4 + k];
+                  }
+                x1px2[j] = ump_x1 * ump_x2;
+              }
+
+            for(j = 0; j < 4; j++)
+              x3[j] = 0.0;
+
+            for(j = 0; j < 4; j++)
+              for(k = 0; k < 4; k++)
+                x3[k] += x1px2[j] * EV[j * 4 + k];          
+          }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        for (i = 0; i < n; i++)
+          {
+            x1 = &(tipVector[4 * tipX1[i]]);
+            x2 = &x2_start[4 * i];
+            x3 = &x3_start[4 * i];
+
+            le =  &left[cptr[i] * 16];
+            ri =  &right[cptr[i] * 16];
+
+            for(j = 0; j < 4; j++)
+              {
+                ump_x1 = 0.0;
+                ump_x2 = 0.0;
+                for(k = 0; k < 4; k++)
+                  {
+                    ump_x1 += x1[k] * le[j * 4 + k];
+                    ump_x2 += x2[k] * ri[j * 4 + k];
+                  }
+                x1px2[j] = ump_x1 * ump_x2;
+              }
+
+            for(j = 0; j < 4; j++)
+              x3[j] = 0.0;
+
+            for(j = 0; j < 4; j++)
+              for(k = 0; k < 4; k++)
+                x3[k] +=  x1px2[j] *  EV[4 * j + k];       
+
+            scale = 1;
+            for(j = 0; j < 4 && scale; j++)
+              scale = (x3[j] < PLL_MINLIKELIHOOD && x3[j] > PLL_MINUSMINLIKELIHOOD);               
+                    
+            if(scale)
+              {             
+                for(j = 0; j < 4; j++)
+                  x3[j] *= PLL_TWOTOTHE256;
+                
+                if(useFastScaling)
+                  addScale += wgt[i];
+                else
+                  ex3[i]  += 1;         
+              }      
+          }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+        {
+          x1 = &x1_start[4 * i];
+          x2 = &x2_start[4 * i];
+          x3 = &x3_start[4 * i];
+
+          le = &left[cptr[i] * 16];
+          ri = &right[cptr[i] * 16];
+
+          for(j = 0; j < 4; j++)
+            {
+              ump_x1 = 0.0;
+              ump_x2 = 0.0;
+              for(k = 0; k < 4; k++)
+                {
+                  ump_x1 += x1[k] * le[j * 4 + k];
+                  ump_x2 += x2[k] * ri[j * 4 + k];
+                }
+              x1px2[j] = ump_x1 * ump_x2;
+            }
+
+          for(j = 0; j < 4; j++)
+            x3[j] = 0.0;
+
+          for(j = 0; j < 4; j++)
+            for(k = 0; k < 4; k++)
+              x3[k] +=  x1px2[j] *  EV[4 * j + k];
+        
+          scale = 1;
+          for(j = 0; j < 4 && scale; j++)
+            scale = (x3[j] < PLL_MINLIKELIHOOD && x3[j] > PLL_MINUSMINLIKELIHOOD);
+
+          if(scale)
+            {               
+              for(j = 0; j < 4; j++)
+                x3[j] *= PLL_TWOTOTHE256;
+              
+              if(useFastScaling)
+                addScale += wgt[i];
+              else
+                ex3[i]  += 1;           
+            }     
+        }
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+
+}
+*/
+#if 0
+static void newviewGTRGAMMA_BINARY(int tipCase,
+                                   double *x1_start, double *x2_start, double *x3_start,
+                                   double *EV, double *tipVector,
+                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                   const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling
+                                   )
+{
+  double
+    *x1, *x2, *x3;
+  double
+    ump_x1,
+    ump_x2,
+    x1px2[4];
+  int i, j, k, l, scale, addScale = 0;
+
+
+  /* C-OPT figure out if we are at an inner node that has two tips/leaves
+     as descendants TIP_TIP, a tip and another inner node as descendant
+     TIP_INNER, or two inner nodes as descendants INNER_INNER */
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+        for (i = 0; i < n; i++)
+          {
+            x1 = &(tipVector[2 * tipX1[i]]);
+            x2 = &(tipVector[2 * tipX2[i]]);
+            x3 = &x3_start[i * 8];
+
+            for(j = 0; j < 8; j++)
+              x3[j] = 0.0;
+
+            for (j = 0; j < 4; j++)
+              {
+                for (k = 0; k < 2; k++)
+                  {
+                    ump_x1 = 0.0;
+                    ump_x2 = 0.0;
+
+                    for (l=0; l < 2; l++)
+                      {
+                        ump_x1 += x1[l] * left[ j*4 + k*2 + l];
+                        ump_x2 += x2[l] * right[j*4 + k*2 + l];
+                      }
+
+                    x1px2[k] = ump_x1 * ump_x2;
+                  }
+
+                for(k = 0; k < 2; k++)
+                  for (l = 0; l < 2; l++)
+                    x3[j * 2 + l] +=  x1px2[k] * EV[2 * k + l];
+
+              }    
+          }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+         for (i = 0; i < n; i++)
+           {
+             x1 = &(tipVector[2 * tipX1[i]]);
+             x2 = &x2_start[i * 8];
+             x3 = &x3_start[i * 8];
+
+             for(j = 0; j < 8; j++)
+               x3[j] = 0.0;
+
+             for (j = 0; j < 4; j++)
+               {
+                 for (k = 0; k < 2; k++)
+                   {
+                     ump_x1 = 0.0;
+                     ump_x2 = 0.0;
+
+                     for (l=0; l < 2; l++)
+                       {
+                         ump_x1 += x1[l] * left[ j*4 + k*2 + l];
+                         ump_x2 += x2[j*2 + l] * right[j*4 + k*2 + l];
+                       }
+
+                     x1px2[k] = ump_x1 * ump_x2;
+                   }
+
+                 for(k = 0; k < 2; k++)
+                   for (l = 0; l < 2; l++)
+                     x3[j * 2 + l] +=  x1px2[k] * EV[2 * k + l];
+
+               }            
+
+             scale = 1;
+             for(l = 0; scale && (l < 8); l++)
+               scale = (PLL_ABS(x3[l]) <  PLL_MINLIKELIHOOD);
+
+             if(scale)
+               {
+                 for (l=0; l < 8; l++)
+                   x3[l] *= PLL_TWOTOTHE256;
+                 
+                 if(useFastScaling)
+                   addScale += wgt[i];
+                 else
+                   ex3[i]  += 1;               
+               }
+
+           }
+      }
+      break;
+    case PLL_INNER_INNER:
+
+      /* C-OPT here we don't do any pre-computations
+         This should be the most compute intensive loop of the three
+         cases here. If we have one or two tips as descendants
+         we can take a couple of shortcuts */
+
+
+     for (i = 0; i < n; i++)
+       {
+         x1 = &x1_start[i * 8];
+         x2 = &x2_start[i * 8];
+         x3 = &x3_start[i * 8];
+
+         for(j = 0; j < 8; j++)
+           x3[j] = 0.0;
+
+         for (j = 0; j < 4; j++)
+           {
+             for (k = 0; k < 2; k++)
+               {
+                 ump_x1 = 0.0;
+                 ump_x2 = 0.0;
+
+                 for (l=0; l < 2; l++)
+                   {
+                     ump_x1 += x1[j*2 + l] * left[ j*4 + k*2 + l];
+                     ump_x2 += x2[j*2 + l] * right[j*4 + k*2 + l];
+                   }
+
+                 x1px2[k] = ump_x1 * ump_x2;
+               }
+
+             for(k = 0; k < 2; k++)
+               for (l = 0; l < 2; l++)
+                 x3[j * 2 + l] +=  x1px2[k] * EV[2 * k + l];
+
+           }
+         
+         scale = 1;
+         for(l = 0; scale && (l < 8); l++)
+           scale = (PLL_ABS(x3[l]) <  PLL_MINLIKELIHOOD);
+
+
+         if(scale)
+           {
+             for (l=0; l<8; l++)
+               x3[l] *= PLL_TWOTOTHE256;
+
+             if(useFastScaling)
+               addScale += wgt[i];
+             else
+               ex3[i]  += 1;      
+           }
+       }
+     break;
+
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+
+}
+
+static void newviewGTRCAT_BINARY( int tipCase,  double *EV,  int *cptr,
+				  double *x1_start,  double *x2_start,  double *x3_start,  double *tipVector,
+				  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+				  int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling)
+{
+  double
+    *le,
+    *ri,
+    *x1, *x2, *x3;
+  double
+    ump_x1, ump_x2, x1px2[2];
+  int i, j, k, scale, addScale = 0;
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	for (i = 0; i < n; i++)
+	  {
+	    x1 = &(tipVector[2 * tipX1[i]]);
+	    x2 = &(tipVector[2 * tipX2[i]]);
+	    x3 = &x3_start[2 * i];	    
+
+	    le =  &left[cptr[i] * 4];
+	    ri =  &right[cptr[i] * 4];
+
+	    for(j = 0; j < 2; j++)
+	      {
+		ump_x1 = 0.0;
+		ump_x2 = 0.0;
+		for(k = 0; k < 2; k++)
+		  {
+		    ump_x1 += x1[k] * le[j * 2 + k];
+		    ump_x2 += x2[k] * ri[j * 2 + k];
+		  }
+		x1px2[j] = ump_x1 * ump_x2;
+	      }
+
+	    for(j = 0; j < 2; j++)
+	      x3[j] = 0.0;
+
+	    for(j = 0; j < 2; j++)
+	      for(k = 0; k < 2; k++)
+		x3[k] += x1px2[j] * EV[j * 2 + k];	   
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+	for (i = 0; i < n; i++)
+	  {
+	    x1 = &(tipVector[2 * tipX1[i]]);
+	    x2 = &x2_start[2 * i];
+	    x3 = &x3_start[2 * i];
+	    
+	    le =  &left[cptr[i] * 4];
+	    ri =  &right[cptr[i] * 4];
+
+	    for(j = 0; j < 2; j++)
+	      {
+		ump_x1 = 0.0;
+		ump_x2 = 0.0;
+		for(k = 0; k < 2; k++)
+		  {
+		    ump_x1 += x1[k] * le[j * 2 + k];
+		    ump_x2 += x2[k] * ri[j * 2 + k];
+		  }
+		x1px2[j] = ump_x1 * ump_x2;
+	      }
+
+	    for(j = 0; j < 2; j++)
+	      x3[j] = 0.0;
+
+	    for(j = 0; j < 2; j++)
+	      for(k = 0; k < 2; k++)
+		x3[k] +=  x1px2[j] *  EV[2 * j + k];	   
+
+	    scale = 1;
+	    for(j = 0; j < 2 && scale; j++)
+	      scale = (x3[j] < PLL_MINLIKELIHOOD && x3[j] > PLL_MINUSMINLIKELIHOOD);
+
+	    if(scale)
+	      {
+		for(j = 0; j < 2; j++)
+		  x3[j] *= PLL_TWOTOTHE256;
+
+		if(useFastScaling)
+		  addScale += wgt[i];
+		else
+		  ex3[i]  += 1;	       
+	      }
+	  }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+	{
+	  x1 = &x1_start[2 * i];
+	  x2 = &x2_start[2 * i];
+	  x3 = &x3_start[2 * i];
+
+	  le = &left[cptr[i] * 4];
+	  ri = &right[cptr[i] * 4];
+
+	  for(j = 0; j < 2; j++)
+	    {
+	      ump_x1 = 0.0;
+	      ump_x2 = 0.0;
+	      for(k = 0; k < 2; k++)
+		{
+		  ump_x1 += x1[k] * le[j * 2 + k];
+		  ump_x2 += x2[k] * ri[j * 2 + k];
+		}
+	      x1px2[j] = ump_x1 * ump_x2;
+	    }
+
+	  for(j = 0; j < 2; j++)
+	    x3[j] = 0.0;
+
+	  for(j = 0; j < 2; j++)
+	    for(k = 0; k < 2; k++)
+	      x3[k] +=  x1px2[j] *  EV[2 * j + k];	  
+
+	  scale = 1;
+	  for(j = 0; j < 2 && scale; j++)
+	    scale = (x3[j] < PLL_MINLIKELIHOOD && x3[j] > PLL_MINUSMINLIKELIHOOD);
+
+	  if(scale)
+	    {
+	      for(j = 0; j < 2; j++)
+		x3[j] *= PLL_TWOTOTHE256;
+
+	      if(useFastScaling)
+		addScale += wgt[i];
+	      else
+		ex3[i]  += 1;	   
+	    }
+	}
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+
+}
+#endif    /* end if 0 */
+#endif
+
+#if (defined(__AVX) || defined(__SSE3))
+static void newviewGTRCAT_BINARY( int tipCase,  double *EV,  int *cptr,
+                                  double *x1_start,  double *x2_start,  double *x3_start,  double *tipVector,
+                                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                  int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling)
+{
+  double
+    *le,
+    *ri,
+    *x1, *x2, *x3;
+  int i, l, scale, addScale = 0;
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+        for(i = 0; i < n; i++)
+          {
+            x1 = &(tipVector[2 * tipX1[i]]);
+            x2 = &(tipVector[2 * tipX2[i]]);
+            x3 = &x3_start[2 * i];         
+
+            le =  &left[cptr[i] * 4];
+            ri =  &right[cptr[i] * 4];
+
+            _mm_store_pd(x3, _mm_setzero_pd());     
+                     
+            for(l = 0; l < 2; l++)
+              {                                                                                                                          
+                __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2]));
+                __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2]));
+                
+                al = _mm_hadd_pd(al, al);
+                ar = _mm_hadd_pd(ar, ar);
+                
+                al = _mm_mul_pd(al, ar);
+                
+                __m128d vv  = _mm_load_pd(x3);
+                __m128d EVV = _mm_load_pd(&EV[2 * l]);
+                
+                vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+                
+                _mm_store_pd(x3, vv);                                                     
+              }            
+          }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        for (i = 0; i < n; i++)
+          {
+            x1 = &(tipVector[2 * tipX1[i]]);
+            x2 = &x2_start[2 * i];
+            x3 = &x3_start[2 * i];
+            
+            le =  &left[cptr[i] * 4];
+            ri =  &right[cptr[i] * 4];
+
+            _mm_store_pd(x3, _mm_setzero_pd());     
+                     
+            for(l = 0; l < 2; l++)
+              {                                                                                                                          
+                __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2]));
+                __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2]));
+                
+                al = _mm_hadd_pd(al, al);
+                ar = _mm_hadd_pd(ar, ar);
+                
+                al = _mm_mul_pd(al, ar);
+                
+                __m128d vv  = _mm_load_pd(x3);
+                __m128d EVV = _mm_load_pd(&EV[2 * l]);
+                
+                vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+                
+                _mm_store_pd(x3, vv);                                                     
+              }  
+            
+            __m128d minlikelihood_sse = _mm_set1_pd(PLL_MINLIKELIHOOD);
+         
+            scale = 1;
+            
+            __m128d v1 = _mm_and_pd(_mm_load_pd(x3), absMask.m);
+            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(_mm_movemask_pd( v1 ) != 3)
+              scale = 0;                         
+            
+            if(scale)
+              {
+                __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+                
+                __m128d ex3v = _mm_load_pd(x3);           
+                _mm_store_pd(x3, _mm_mul_pd(ex3v,twoto));                                                 
+                
+                if(useFastScaling)
+                  addScale += wgt[i];
+                else
+                  ex3[i]  += 1;   
+              }                    
+          }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+        {
+          x1 = &x1_start[2 * i];
+          x2 = &x2_start[2 * i];
+          x3 = &x3_start[2 * i];
+
+          le = &left[cptr[i] * 4];
+          ri = &right[cptr[i] * 4];
+
+          _mm_store_pd(x3, _mm_setzero_pd());       
+          
+          for(l = 0; l < 2; l++)
+            {                                                                                                                            
+              __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2]));
+              __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2]));
+              
+              al = _mm_hadd_pd(al, al);
+              ar = _mm_hadd_pd(ar, ar);
+              
+              al = _mm_mul_pd(al, ar);
+              
+              __m128d vv  = _mm_load_pd(x3);
+              __m128d EVV = _mm_load_pd(&EV[2 * l]);
+              
+              vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+              
+              _mm_store_pd(x3, vv);                                                       
+            }                             
+
+          __m128d minlikelihood_sse = _mm_set1_pd(PLL_MINLIKELIHOOD);
+         
+          scale = 1;
+                  
+          __m128d v1 = _mm_and_pd(_mm_load_pd(x3), absMask.m);
+          v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+          if(_mm_movemask_pd( v1 ) != 3)
+            scale = 0;                   
+         
+          if(scale)
+            {
+              __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+                    
+              __m128d ex3v = _mm_load_pd(x3);             
+              _mm_store_pd(x3, _mm_mul_pd(ex3v,twoto));                                           
+             
+              if(useFastScaling)
+                addScale += wgt[i];
+              else
+                ex3[i]  += 1;     
+           }             
+        }
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+
+}
+
+static void newviewGTRGAMMA_BINARY(int tipCase,
+				   double *x1_start, double *x2_start, double *x3_start,
+				   double *EV, double *tipVector,
+				   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+				   const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling
+				   )
+{
+  double
+    *x1, *x2, *x3;
+ 
+  int i, k, l, scale, addScale = 0; 
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      for (i = 0; i < n; i++)
+       {
+	 x1  = &(tipVector[2 * tipX1[i]]);
+	 x2  = &(tipVector[2 * tipX2[i]]);
+	 
+	 for(k = 0; k < 4; k++)
+	   {	     	     	    
+	     x3 = &(x3_start[8 * i + 2 * k]);	     
+	    	         
+	     _mm_store_pd(x3, _mm_setzero_pd());	    
+	    	     
+	     for(l = 0; l < 2; l++)
+	       {		 		 						   		  		 		 
+		 __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2]));
+		 __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2]));
+		 		       
+		 al = _mm_hadd_pd(al, al);
+		 ar = _mm_hadd_pd(ar, ar);
+		   
+		 al = _mm_mul_pd(al, ar);
+		   
+		 __m128d vv  = _mm_load_pd(x3);
+		 __m128d EVV = _mm_load_pd(&EV[2 * l]);
+		 
+		 vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		 
+		 _mm_store_pd(x3, vv);		     	  		   		  
+	       }	     	    
+	   }
+       }
+      break;
+    case PLL_TIP_INNER:
+      for (i = 0; i < n; i++)
+       {
+	 x1  = &(tipVector[2 * tipX1[i]]);
+	 
+	 for(k = 0; k < 4; k++)
+	   {	     	     
+	     x2 = &(x2_start[8 * i + 2 * k]);
+	     x3 = &(x3_start[8 * i + 2 * k]);	     
+	    	         
+	     _mm_store_pd(x3, _mm_setzero_pd());	    
+	    	     
+	     for(l = 0; l < 2; l++)
+	       {		 		 						   		  		 		 
+		 __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2]));
+		 __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2]));
+		 		       
+		 al = _mm_hadd_pd(al, al);
+		 ar = _mm_hadd_pd(ar, ar);
+		   
+		 al = _mm_mul_pd(al, ar);
+		   
+		 __m128d vv  = _mm_load_pd(x3);
+		 __m128d EVV = _mm_load_pd(&EV[2 * l]);
+		 
+		 vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		 
+		 _mm_store_pd(x3, vv);		     	  		   		  
+	       }	     	    
+	   }
+	
+	 x3 = &(x3_start[8 * i]);
+	 __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+	 
+	 scale = 1;
+	 for(l = 0; scale && (l < 8); l += 2)
+	   {
+	     __m128d vv = _mm_load_pd(&x3[l]);
+	     __m128d v1 = _mm_and_pd(vv, absMask.m);
+	     v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+	     if(_mm_movemask_pd( v1 ) != 3)
+	       scale = 0;
+	   }	    	         
+	 
+	 if(scale)
+	   {
+	     __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+	     
+	     for(l = 0; l < 8; l+=2)
+	       {
+		 __m128d ex3v = _mm_load_pd(&x3[l]);		  
+		 _mm_store_pd(&x3[l], _mm_mul_pd(ex3v,twoto));	
+	       }		   		  
+	     
+	     if(useFastScaling)
+	       addScale += wgt[i];
+	     else
+	       ex3[i]  += 1;	  
+	   }	 
+       }      
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+       {	 
+	 for(k = 0; k < 4; k++)
+	   {	     
+	     x1 = &(x1_start[8 * i + 2 * k]);
+	     x2 = &(x2_start[8 * i + 2 * k]);
+	     x3 = &(x3_start[8 * i + 2 * k]);	     
+	    	         
+	     _mm_store_pd(x3, _mm_setzero_pd());	    
+	    	     
+	     for(l = 0; l < 2; l++)
+	       {		 		 						   		  		 		 
+		 __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2]));
+		 __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2]));
+		 		       
+		 al = _mm_hadd_pd(al, al);
+		 ar = _mm_hadd_pd(ar, ar);
+		   
+		 al = _mm_mul_pd(al, ar);
+		   
+		 __m128d vv  = _mm_load_pd(x3);
+		 __m128d EVV = _mm_load_pd(&EV[2 * l]);
+		 
+		 vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+		 
+		 _mm_store_pd(x3, vv);		     	  		   		  
+	       }	     	    
+	   }
+	
+	 x3 = &(x3_start[8 * i]);
+	 __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+	 
+	 scale = 1;
+	 for(l = 0; scale && (l < 8); l += 2)
+	   {
+	     __m128d vv = _mm_load_pd(&x3[l]);
+	     __m128d v1 = _mm_and_pd(vv, absMask.m);
+	     v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+	     if(_mm_movemask_pd( v1 ) != 3)
+	       scale = 0;
+	   }	    	         
+	 
+	 if(scale)
+	   {
+	     __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+	     
+	     for(l = 0; l < 8; l+=2)
+	       {
+		 __m128d ex3v = _mm_load_pd(&x3[l]);		  
+		 _mm_store_pd(&x3[l], _mm_mul_pd(ex3v,twoto));	
+	       }		   		  
+	     
+	     if(useFastScaling)
+	       addScale += wgt[i];
+	     else
+	       ex3[i]  += 1;	  
+	   }	 
+       }
+      break;
+
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+
+}
+
+
+#endif
+
+
+
+
+/* The function below computes partial traversals only down to the point/node in the tree where the 
+   conditional likelihood vector summarizing a subtree is already oriented in the correct direction */
+
+
+/** @brief Compute a partial or full traversal descriptor for a subtree of the topology
+
+   If \a partialTraversal is set to \b PLL_TRUE, compute a partial traversal descriptor down 
+   to the point/node in the tree where the conditional likelihood vector representing a subtree is
+   already oriented in the correct direction; otherwise compute a full traversal. The elements of the
+   traversal descriptor are stored in \a ti and a \a counter keeps track of the number of elements.
+
+   @param p
+     Root of the subtree for which we want to compute the traversal descriptor. The two descendants are \a p->next->back and \a p->next->next->back
+
+   @param ti
+     Traversal descriptor element structure
+
+   @param counter
+     Number of elements in the traversal descriptor. Updated when an element is added
+
+   @param maxTips
+     Number of tips in the tree structure
+
+   @param numBranches
+     Number of branches
+   
+   @param partialTraversal
+     If \b PLL_TRUE, a partial traversal descriptor is computed, otherwise a full
+
+   @param rvec
+     Recomputation vectors structure used when ancestral state recomputation is enabled (see \a useRecom)
+
+   @param useRecom
+     If \b PLL_TRUE, then ancestral state recomputation is enabled.
+   
+   @todo Fill in the ancestral recomputation parameter information 
+ */
+static void computeTraversalInfo(nodeptr p, traversalInfo *ti, int *counter, int maxTips, int numBranches, pllBoolean partialTraversal, recompVectors *rvec, pllBoolean useRecom)
+{
+  /* if it's a tip we don't do anything */
+
+  if(isTip(p->number, maxTips))
+    return;
+
+  {
+    int 
+      i;
+
+    /* recom default values */
+    int slot = -1,
+        unpin1 = -1, 
+        unpin2 = -1;
+    /* get the left and right descendants */
+
+    nodeptr 
+      q = p->next->back,
+        r = p->next->next->back;   
+
+    /* if the left and right children are tips there is not that much to do */
+    if(isTip(r->number, maxTips) && isTip(q->number, maxTips))
+    {
+      /* fix the orientation of p->x */
+
+      if (! p->x)
+        getxnode(p);    
+      
+      assert(p->x);
+
+      /* add the current node triplet p,q,r to the traversal descriptor */
+      ti[*counter].tipCase = PLL_TIP_TIP;
+      ti[*counter].pNumber = p->number;
+      ti[*counter].qNumber = q->number;
+      ti[*counter].rNumber = r->number;
+
+
+      /* copy branches to traversal descriptor */
+      for(i = 0; i < numBranches; i++)
+      {     
+        ti[*counter].qz[i] = q->z[i];
+        ti[*counter].rz[i] = r->z[i];
+      }
+
+      /* recom - add the slot to the traversal descriptor */
+      if(useRecom)
+      {
+        getxVector(rvec, p->number, &slot, maxTips);
+        ti[*counter].slot_p = slot;
+        ti[*counter].slot_q = -1;
+        ti[*counter].slot_r = -1;
+      }
+
+      /* increment length counter */
+
+      *counter = *counter + 1;
+    }
+    else
+    {
+      /* if either r or q is a tip, swap them to make sure that the tip data is stored 
+         for q */
+      if(isTip(r->number, maxTips) || isTip(q->number, maxTips))
+      {     
+        if(isTip(r->number, maxTips))
+        {
+          nodeptr 
+            tmp = r;
+          r = q;
+          q = tmp;
+        }
+
+
+        /* if the orientation of the likelihood vector at r is not correct we need to re-compute it 
+           and descend into its subtree to figure out if there are more vectors in there to re-compute and 
+           re-orient */
+
+        if(needsRecomp(useRecom, rvec, r, maxTips) || !partialTraversal) 
+          computeTraversalInfo(r, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+        else
+          {
+            if(useRecom)
+              /* the node is available,  now make sure it will not be unpinned until it is read */
+              protectNode(rvec, r->number, maxTips);
+          }
+        /* Now that r is oriented, we can safely set the orientation of p */
+        if(! p->x)
+          getxnode(p);   
+
+        /* make sure that everything is consistent now */
+
+        assert(p->x && r->x);
+
+        /* store data for p, q, r in the traversal descriptor */
+
+        ti[*counter].tipCase = PLL_TIP_INNER;
+        ti[*counter].pNumber = p->number;
+        ti[*counter].qNumber = q->number;
+        ti[*counter].rNumber = r->number;
+
+        for(i = 0; i < numBranches; i++)
+        {       
+          ti[*counter].qz[i] = q->z[i];
+          ti[*counter].rz[i] = r->z[i];
+        }
+
+        if(useRecom)
+        {
+          getxVector(rvec, r->number, &slot, maxTips);
+          ti[*counter].slot_r = slot;
+
+          getxVector(rvec, p->number, &slot, maxTips);
+          ti[*counter].slot_p = slot;
+
+          ti[*counter].slot_q = -1;
+
+          unpin2 = r->number; /* when PLL_TIP_INNER finishes, the INNER input vector r can be unpinned*/
+        }
+
+        *counter = *counter + 1;
+      }
+      else
+      {
+        /* same as above, only now q and r are inner nodes. Hence if they are not 
+           oriented correctly they will need to be recomputed and we need to descend into the 
+           respective subtrees to check if everything is consistent in there, potentially expanding 
+           the traversal descriptor */
+        if(( useRecom && (!partialTraversal) ) || 
+            ( useRecom && needsRecomp(useRecom, rvec, q, maxTips) && needsRecomp(useRecom, rvec, r, maxTips) ))
+        {
+          /* PLL_INNER_INNER and recomputation imply that the order in which we descend q and r matters; 
+           * if we are in a partial traversal, this is only relevant if both require recomputation
+           * see TODOFER add ref. */
+
+          int q_stlen = rvec->stlen[q->number - maxTips - 1],
+              r_stlen = rvec->stlen[r->number - maxTips - 1];
+          assert(q_stlen >= 2 && q_stlen <= maxTips - 1);
+          assert(r_stlen >= 2 && r_stlen <= maxTips - 1);
+
+          if(q_stlen > r_stlen)
+          {
+            computeTraversalInfo(q, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+            computeTraversalInfo(r, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+          }
+          else
+          {
+            computeTraversalInfo(r, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+            computeTraversalInfo(q, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+          }
+        }
+        else
+        {
+          /* Now the order does not matter */
+          /* If we are in a recomputation and partial, only either q or r will be descended */
+
+          if(!partialTraversal || needsRecomp(useRecom, rvec, q, maxTips))
+            computeTraversalInfo(q, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+          else
+          {
+            if(useRecom)
+              /* the node is available,  now make sure it will not be unpinned until it is read */
+              protectNode(rvec, q->number, maxTips);
+          }
+
+          if(!partialTraversal || needsRecomp(useRecom, rvec, r, maxTips))
+            computeTraversalInfo(r, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+          else
+          {
+            if(useRecom)
+              protectNode(rvec, r->number, maxTips);
+          }
+        }
+
+
+        if(! p->x)
+          getxnode(p);
+
+        /* check that the vector orientations are consistent now */
+
+        assert(p->x && r->x && q->x);
+
+        ti[*counter].tipCase = PLL_INNER_INNER;
+        ti[*counter].pNumber = p->number;
+        ti[*counter].qNumber = q->number;
+        ti[*counter].rNumber = r->number;
+
+        if(useRecom)
+        {
+          /* We check that the strategy cannot re-use slots */
+          getxVector(rvec, q->number, &slot, maxTips);
+          ti[*counter].slot_q = slot;
+
+          getxVector(rvec, r->number, &slot, maxTips);
+          ti[*counter].slot_r = slot;
+          assert(slot != ti[*counter].slot_q);
+
+          getxVector(rvec, p->number, &slot, maxTips);
+          ti[*counter].slot_p = slot;
+          assert(slot != ti[*counter].slot_q);
+          assert(slot != ti[*counter].slot_r);
+
+          /* And at this point both input INNER vectors can be marked as unpinnable */
+          unpin2 = r->number;
+          unpin1 = q->number;
+        }
+
+        for(i = 0; i < numBranches; i++)
+        {       
+          ti[*counter].qz[i] = q->z[i];
+          ti[*counter].rz[i] = r->z[i];
+        }
+
+        *counter = *counter + 1;
+      }
+    }
+    if(useRecom)
+    {
+      /* Mark the nodes as unpinnable (will be unpinned while executing the replacement strategy only if required) */
+      unpinNode(rvec, unpin1, maxTips);
+      unpinNode(rvec, unpin2, maxTips);
+    }
+  }
+}
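+
+/* A minimal, hypothetical usage sketch (illustration only; the actual driver code lives
+   elsewhere in the library): fill the traversal descriptor of the instance starting at an
+   inner node p and hand it to pllNewviewIterative() defined below. Everything in the
+   sketch apart from the two functions and the PLL data structures already used above is
+   made up for the example. */
+#if 0
+static void exampleUpdatePartials(pllInstance *tr, partitionList *pr, nodeptr p,
+                                  recompVectors *rvec, pllBoolean partialTraversal)
+{
+  /* start with an empty descriptor and append one entry per inner node whose
+     conditional likelihood array needs to be (re-)computed */
+  tr->td[0].count = 0;
+
+  computeTraversalInfo(p, &(tr->td[0].ti[0]), &(tr->td[0].count), tr->mxtips,
+                       pr->perGeneBranchLengths ? pr->numberOfPartitions : 1,
+                       partialTraversal, rvec, tr->useRecom);
+
+  /* process the collected entries in the order in which they were appended */
+  if(tr->td[0].count > 0)
+    pllNewviewIterative(tr, pr, 0);
+}
+#endif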
+
+/* below are the optimized, unrolled, and vectorized versions of the above generic functions 
+   for computing the conditional likelihood at p given child nodes q and r. The actual implementations are located at the end/bottom of this 
+   file.
+   */
+/* now this is the function that just iterates over the length of the traversal descriptor and 
+   computes the conditional likelihood arrays in the order given by the descriptor.
+   So in a sense, this function has no clue that there is any tree-like structure 
+   in the traversal descriptor, it just operates on an array of structs of given length */ 
+
+
+/** @brief Compute the conditional likelihood for each entry (node) of the traversal descriptor
+
+    Computes the conditional likelihood vectors for each entry (node) in the already computed
+    traversal descriptor, starting from the \a startIndex entry.
+     
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param startIndex
+      From which node to start computing the conditional likelihood vectors in the traversal
+      descriptor
+     
+    @note This function just iterates over the length of the traversal descriptor and 
+      computes the conditional likelihood arrays in the order given by the descriptor.
+      So in a sense, this function has no clue that there is any tree-like structure 
+      in the traversal descriptor, it just operates on an array of structs of given length.
+ */
+void pllNewviewIterative (pllInstance *tr, partitionList *pr, int startIndex)
+{
+  traversalInfo 
+    *ti   = tr->td[0].ti;
+
+  int 
+    i, 
+    model;
+
+  int 
+    p_slot = -1, 
+    q_slot = -1, 
+    r_slot = -1;
+
+#ifdef _DEBUG_RECOMPUTATION
+  /* recom */
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+#else
+  countTraversal(tr);
+#endif
+  /* E recom */
+#endif
+
+  /* loop over traversal descriptor length. Note that on average we only re-compute the conditionals on 3-4 
+     nodes in RAxML */
+
+  for(i = startIndex; i < tr->td[0].count; i++)
+  {
+
+    traversalInfo 
+      *tInfo = &ti[i];
+    
+    /* Note that the slots refer to different things if recomputation is applied */
+    if(tr->useRecom)
+      {
+        /* a slot has been assigned while computing the traversal descriptor  */
+        p_slot = tInfo->slot_p;
+        q_slot = tInfo->slot_q;
+        r_slot = tInfo->slot_r;
+      }
+    else
+      {
+        /* a fixed slot is always given for each inner node, we only need an offset to get the right index */
+        p_slot = tInfo->pNumber - tr->mxtips - 1;
+        q_slot = tInfo->qNumber - tr->mxtips - 1;
+        r_slot = tInfo->rNumber - tr->mxtips - 1;
+      }
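+
+    /* Illustration (descriptive only, using the usual PLL numbering in which tips are
+       1 .. mxtips and inner nodes mxtips+1 .. 2*mxtips-2): with mxtips = 100 the inner
+       nodes are numbered 101 .. 198, so without recomputation the fixed slots
+       p_slot/q_slot/r_slot fall into the range 0 .. 97. */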
+
+    /* now loop over all partitions for nodes p, q, and r of the current traversal vector entry */
+
+    for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      /* number of sites in this partition */
+      size_t            
+        width  = (size_t)pr->partitionData[model]->width;
+
+      /* this conditional statement is exactly identical to what we do in pllEvaluateIterative */
+
+      if(tr->td[0].executeModel[model] && width > 0)
+      {       
+        double
+          *x1_start = (double*)NULL,
+          *x2_start = (double*)NULL,
+          *x3_start = pr->partitionData[model]->xVector[p_slot],
+          *left     = (double*)NULL,
+          *right    = (double*)NULL,            
+#if (defined(__SSE3) || defined(__AVX))
+          *x1_gapColumn = (double*)NULL,
+          *x2_gapColumn = (double*)NULL,
+          *x3_gapColumn = (double*)NULL,
+#endif
+          *rateCategories = (double*)NULL,
+          *x1_ascColumn = NULL,
+          *x2_ascColumn = NULL,
+          *x3_ascColumn = NULL;
+
+        int
+          categories,
+          scalerIncrement = 0,
+
+          /* integer weight vector with pattern compression weights */
+
+          *wgt = pr->partitionData[model]->wgt;
+
+        /* pointers for per-site scaling array at node p */
+        
+        int      
+          *ex3     = NULL,
+          *ex3_asc = NULL;
+
+        /* select fastScaling or per-site scaling of conditional likelihood entries */
+
+        pllBoolean
+          fastScaling = tr->fastScaling;
+
+#if (defined(__SSE3) || defined(__AVX))
+        unsigned int
+          *x1_gap = (unsigned int*)NULL,
+          *x2_gap = (unsigned int*)NULL,
+          *x3_gap = (unsigned int*)NULL;
+#endif
+
+        unsigned char
+          *tipX1 = (unsigned char *)NULL,
+          *tipX2 = (unsigned char *)NULL;
+
+        double 
+          qz, 
+          rz;        
+
+        size_t
+#if (defined(__SSE3) || defined(__AVX))
+          gapOffset = 0,
+#endif
+          rateHet = discreteRateCategories(tr->rateHetModel),
+          ascWidth = (size_t)pr->partitionData[model]->states,
+
+          /* get the number of states in the data stored in partition model */
+          
+          states = (size_t)pr->partitionData[model]->states,
+          
+          /* get the length of the current likelihood array stored at node p. This is 
+             important mainly for the SEV-based memory saving option described in here:
+             
+             F. Izquierdo-Carrasco, S.A. Smith, A. Stamatakis: "Algorithms, Data Structures, and Numerics for Likelihood-based Phylogenetic Inference of Huge Trees".
+             
+             So pr->partitionData[model]->xSpaceVector[i] provides the length of the allocated conditional array of partition model
+             and node i 
+          */
+          
+          availableLength = pr->partitionData[model]->xSpaceVector[p_slot],
+          requiredLength = 0;        
+        
+        /* figure out what kind of rate heterogeneity approach we are using */
+
+        if(tr->rateHetModel == PLL_CAT)
+          {              
+            rateCategories = pr->partitionData[model]->perSiteRates;
+            categories = pr->partitionData[model]->numberOfCategories;
+          }
+        else
+          {                              
+            rateCategories = pr->partitionData[model]->gammaRates;
+            categories = 4;
+          }
+
+        /* memory saving stuff, not important right now, but if you are interested ask Fernando */
+
+#if (defined(__SSE3) || defined(__AVX))
+        if(tr->saveMemory)
+          {
+            size_t
+              j,
+              setBits = 0;                
+            
+            gapOffset = states * (size_t)getUndetermined(pr->partitionData[model]->dataType);
+            
+            x1_gap = &(pr->partitionData[model]->gapVector[tInfo->qNumber * pr->partitionData[model]->gapVectorLength]);
+            x2_gap = &(pr->partitionData[model]->gapVector[tInfo->rNumber * pr->partitionData[model]->gapVectorLength]);
+            x3_gap = &(pr->partitionData[model]->gapVector[tInfo->pNumber * pr->partitionData[model]->gapVectorLength]);
+            
+            for(j = 0; j < (size_t)pr->partitionData[model]->gapVectorLength; j++)
+              {              
+                x3_gap[j] = x1_gap[j] & x2_gap[j];
+                setBits += (size_t)(bitcount_32_bit(x3_gap[j])); 
+              }
+            
+            requiredLength = (width - setBits)  * rateHet * states * sizeof(double);            
+          }
+        else
+#endif
+          {
+            /* if we are not trying to save memory the space required to store an inner likelihood array 
+               is the number of sites in the partition times the number of states of the data type in the partition 
+               times the number of discrete GAMMA rates (1 for CAT essentially) times 8 bytes */
+            requiredLength  =  virtual_width( width ) * rateHet * states * sizeof(double);
+            
+            //                   printf( "req: %d %d %d %d\n", requiredLength, width, virtual_width(width), model );
+          }
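+
+        /* Worked size example (descriptive only): for a DNA partition (states = 4) under
+           GAMMA (rateHet = 4) with 1,000 alignment patterns, the dense case above needs
+           1000 * 4 * 4 * 8 bytes = 128,000 bytes per inner conditional likelihood array,
+           plus whatever padding virtual_width() adds to the pattern count. */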
+        
+        /* Initially, even when not using memory saving, no space is allocated for inner likelihood arrays, hence 
+           availableLength will be zero the very first time we traverse the tree.
+           Hence we need to allocate something here */
+
+        if(requiredLength != availableLength)
+          {               
+            /* if there is a vector of incorrect length assigned here i.e., x3 != NULL we must free 
+               it first */
+            if(x3_start)
+              rax_free(x3_start);
+            
+            /* allocate memory: note that here we use a byte-boundary aligned malloc, because we need the vectors
+               to be aligned at 16 BYTE (SSE3) or 32 BYTE (AVX) boundaries! */
+            
+            rax_posix_memalign ((void **)&x3_start, PLL_BYTE_ALIGNMENT, requiredLength);              
+            
+            /* update the data structures for consistent bookkeeping */
+            pr->partitionData[model]->xVector[p_slot]      = x3_start;
+            pr->partitionData[model]->xSpaceVector[p_slot] = requiredLength;
+          }
+        
+
+        /* 
+           if we are not using fast scaling, we need to assign memory for storing 
+           integer vectors at each inner node that are as long as the number of sites in the 
+           partition. IMPORTANT: while this might look like a memory saving trick 
+           it is not. The ex3 vectors will be allocated once during the very first tree 
+           traversal and then never again because they will always have the required length!
+        */
+
+        if(!fastScaling)
+          {
+            size_t
+              availableExpLength = pr->partitionData[model]->expSpaceVector[p_slot],
+              requiredExpLength  = width * sizeof(int);
+            
+            ex3 = pr->partitionData[model]->expVector[p_slot];
+            
+            if(requiredExpLength != availableExpLength)
+              {
+                if(ex3)
+                  rax_free(ex3);
+                
+                rax_posix_memalign ((void **)&ex3, PLL_BYTE_ALIGNMENT, requiredExpLength);               
+                
+                pr->partitionData[model]->expVector[p_slot] = ex3;
+                
+                pr->partitionData[model]->expSpaceVector[p_slot] = requiredExpLength;
+              }
+          }
+
+        /* now just set the pointers for data accesses in the newview() implementations above to the corresponding values 
+           according to the tip case */
+        
+        switch(tInfo->tipCase)
+          {
+          case PLL_TIP_TIP:           
+            tipX1    = pr->partitionData[model]->yVector[tInfo->qNumber];
+            tipX2    = pr->partitionData[model]->yVector[tInfo->rNumber];
+
+#if (defined(__SSE3) || defined(__AVX))
+            if(tr->saveMemory)
+              {
+                x1_gapColumn   = &(pr->partitionData[model]->tipVector[gapOffset]);
+                x2_gapColumn   = &(pr->partitionData[model]->tipVector[gapOffset]);
+                x3_gapColumn   = &(pr->partitionData[model]->gapColumn[(tInfo->pNumber - tr->mxtips - 1) * states * rateHet]);
+              }
+#endif            
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+            if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+            if(pr->partitionData[model]->ascBias)
+#endif
+             {
+              size_t
+                k;
+              
+              x3_ascColumn = &pr->partitionData[model]->ascVector[(tInfo->pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+              ex3_asc      = &pr->partitionData[model]->ascExpVector[(tInfo->pNumber - tr->mxtips - 1) * ascWidth];
+
+              for(k = 0; k < ascWidth; k++)
+                ex3_asc[k] = 0;               
+             }
+            /* if we do per-site log likelihood scaling, and both child nodes are tips,
+               just initialize the vector with zeros, i.e., no scaling events */
+
+            if(!fastScaling)
+              {
+                size_t
+                  k;                                 
+
+                for(k = 0; k < width; k++)
+                  ex3[k] = 0;
+              }
+            break;
+          case PLL_TIP_INNER:                
+            tipX1    =  pr->partitionData[model]->yVector[tInfo->qNumber];
+            x2_start = pr->partitionData[model]->xVector[r_slot];
+            assert(r_slot != p_slot);
+            
+#if (defined(__SSE3) || defined(__AVX))
+            if(tr->saveMemory)
+              { 
+                x1_gapColumn   = &(pr->partitionData[model]->tipVector[gapOffset]);
+                x2_gapColumn   = &pr->partitionData[model]->gapColumn[(tInfo->rNumber - tr->mxtips - 1) * states * rateHet];
+                x3_gapColumn   = &pr->partitionData[model]->gapColumn[(tInfo->pNumber - tr->mxtips - 1) * states * rateHet];
+              }
+#endif
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+            if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+              if(pr->partitionData[model]->ascBias)
+#endif      
+              {   
+                size_t
+                  k;
+
+                int 
+                  *ex2_asc;
+                
+                x2_ascColumn = &pr->partitionData[model]->ascVector[(tInfo->rNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                x3_ascColumn = &pr->partitionData[model]->ascVector[(tInfo->pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                
+                ex2_asc = &pr->partitionData[model]->ascExpVector[(tInfo->rNumber - tr->mxtips - 1) * ascWidth];
+                ex3_asc = &pr->partitionData[model]->ascExpVector[(tInfo->pNumber - tr->mxtips - 1) * ascWidth];
+
+                for(k = 0; k < ascWidth; k++)
+                  ex3_asc[k] = ex2_asc[k];
+              }
+            
+            /* if one child node is not a tip, just copy the values from there (could also be done with memcpy, of course); 
+               the elements of ex3[] will then potentially be further incremented in the actual newview() if scaling events 
+               take place */
+
+            if(!fastScaling)
+              {
+                size_t 
+                  k;
+                int
+                  *ex2 = pr->partitionData[model]->expVector[r_slot];                
+                      
+                for(k = 0; k < width; k++)
+                  ex3[k] = ex2[k];
+              }
+            break;
+          case PLL_INNER_INNER:                              
+            x1_start       = pr->partitionData[model]->xVector[q_slot];
+            x2_start       = pr->partitionData[model]->xVector[r_slot];
+            assert(r_slot != p_slot);
+            assert(q_slot != p_slot);
+            assert(q_slot != r_slot);
+            
+#if (defined(__SSE3) || defined(__AVX))
+            if(tr->saveMemory)
+              {
+                x1_gapColumn   = &pr->partitionData[model]->gapColumn[(tInfo->qNumber - tr->mxtips - 1) * states * rateHet];
+                x2_gapColumn   = &pr->partitionData[model]->gapColumn[(tInfo->rNumber - tr->mxtips - 1) * states * rateHet];
+                x3_gapColumn   = &pr->partitionData[model]->gapColumn[(tInfo->pNumber - tr->mxtips - 1) * states * rateHet];
+              }
+#endif
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+              if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+              if(pr->partitionData[model]->ascBias)
+#endif          
+               {                
+                 size_t
+                   k;
+
+                 int 
+                   *ex1_asc,
+                   *ex2_asc;
+                 
+                 x1_ascColumn = &pr->partitionData[model]->ascVector[(tInfo->qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                 x2_ascColumn = &pr->partitionData[model]->ascVector[(tInfo->rNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                 x3_ascColumn = &pr->partitionData[model]->ascVector[(tInfo->pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                 
+                 ex1_asc = &pr->partitionData[model]->ascExpVector[(tInfo->qNumber - tr->mxtips - 1) * ascWidth];
+                 ex2_asc = &pr->partitionData[model]->ascExpVector[(tInfo->rNumber - tr->mxtips - 1) * ascWidth];
+                 ex3_asc = &pr->partitionData[model]->ascExpVector[(tInfo->pNumber - tr->mxtips - 1) * ascWidth];
+
+                 for(k = 0; k < ascWidth; k++)
+                   ex3_asc[k] = ex1_asc[k] + ex2_asc[k];
+               }
+            /* both child nodes are inner nodes, thus the initial value of the scaling vector 
+               ex3 is the sum of the scaling values of the left and right child node */
+
+            if(!fastScaling)
+              {
+                size_t
+                  k;
+                      
+                int            
+                  *ex1      = pr->partitionData[model]->expVector[q_slot],
+                  *ex2      = pr->partitionData[model]->expVector[r_slot];                    
+                      
+                  for(k = 0; k < width; k++)
+                    ex3[k] = ex1[k] + ex2[k];
+              }
+            break;
+          default:
+            assert(0);
+          }
+
+        /* set the pointers to the left and right P matrices to the pre-allocated memory space for storing them */
+
+        left  = pr->partitionData[model]->left;
+        right = pr->partitionData[model]->right;
+
+        /* if we use per-partition branch length optimization, 
+           get the branch length of partition model and take the log; otherwise 
+           use the joint branch length among all partitions, which is always stored 
+           at index [0] */
+
+        if(pr->perGeneBranchLengths)
+        {
+          qz = tInfo->qz[model];                                    
+          rz = tInfo->rz[model];                  
+        }
+        else
+        {
+          qz = tInfo->qz[0];
+          rz = tInfo->rz[0];
+        }
+
+        qz = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);                        
+        rz = (rz > PLL_ZMIN) ? log(rz) : log(PLL_ZMIN);                       
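+        /* note (descriptive only): PLL stores branch lengths in an exponential
+           transform (the z values), so the log taken above, clamped at PLL_ZMIN to
+           avoid log(0) for effectively zero-length branches, recovers the quantity
+           that makeP() combines with the model eigenvalues below. */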
+
+        /* compute the left and right P matrices */
+
+        if(pr->partitionData[model]->dataType == PLL_AA_DATA &&
+        		(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X))
+                makeP_FlexLG4(qz, rz, pr->partitionData[model]->gammaRates,
+                              pr->partitionData[model]->EI_LG4,
+                              pr->partitionData[model]->EIGN_LG4,
+                              4, left, right, 20);
+        else
+        makeP(qz, rz, rateCategories,   pr->partitionData[model]->EI,
+              pr->partitionData[model]->EIGN, categories,
+              left, right, tr->saveMemory, tr->maxCategories, states);
+
+
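+        /* left and right now hold, per rate category, a states x states block derived
+           from the eigen-decomposition (EI/EIGN) of the substitution model and the two
+           log branch lengths qz and rz; the newview kernels dispatched below consume
+           them together with EV/extEV. */
+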
+#if (!defined(__SSE3) && !defined(__AVX) && !defined(__MIC_NATIVE))
+        assert(!tr->saveMemory);
+
+        /* figure out if we need to compute the CAT or GAMMA model of rate heterogeneity */
+
+        if(tr->rateHetModel == PLL_CAT)
+         {
+
+           newviewCAT_FLEX(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                           x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                           ex3, tipX1, tipX2,
+                           width, left, right, wgt, &scalerIncrement, fastScaling, states);
+         }
+        else 
+         {
+            newviewGAMMA_FLEX(tInfo->tipCase,
+                 x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                 0, tipX1, tipX2,
+                 width, left, right, wgt, &scalerIncrement, fastScaling, states, getUndetermined(pr->partitionData[model]->dataType) + 1);
+         }
+#else
+        /* dedicated highly optimized functions. Analogously to the functions in evaluateGeneric() 
+           we also switch over the state number */
+
+        switch(states)
+        {               
+        case 2:
+          assert (!tr->saveMemory);
+          if (tr->rateHetModel == PLL_CAT)
+           {
+             newviewGTRCAT_BINARY(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                  x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                  ex3, tipX1, tipX2,
+                                  width, left, right, wgt, &scalerIncrement, fastScaling);
+           }
+          else
+           {
+             newviewGTRGAMMA_BINARY(tInfo->tipCase,
+                                    x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                    ex3, tipX1, tipX2,
+                                    width, left, right, wgt, &scalerIncrement, fastScaling);                  
+           }
+          break;
+
+        case 4: /* DNA */
+#ifdef __MIC_NATIVE
+
+              /* CAT & memory saving are not supported on MIC */
+
+              assert(!tr->saveMemory);
+              assert(tr->rateHetModel == PLL_GAMMA);
+
+              newviewGTRGAMMA_MIC(tInfo->tipCase,
+                                x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                ex3, tipX1, tipX2,
+                                width, left, right, wgt, &scalerIncrement, fastScaling);
+#else
+          if(tr->rateHetModel == PLL_CAT)
+            {                                
+              
+              if(tr->saveMemory)
+#ifdef __AVX
+                newviewGTRCAT_AVX_GAPPED_SAVE(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                              x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                              ex3, tipX1, tipX2,
+                                              width, left, right, wgt, &scalerIncrement, fastScaling, x1_gap, x2_gap, x3_gap,
+                                              x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
+#else
+                newviewGTRCAT_SAVE(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                   x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                   ex3, tipX1, tipX2,
+                                   width, left, right, wgt, &scalerIncrement, fastScaling, x1_gap, x2_gap, x3_gap,
+                                   x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
+#endif
+              else
+#ifdef __AVX
+                newviewGTRCAT_AVX(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                  x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                  ex3, tipX1, tipX2,
+                                  width, left, right, wgt, &scalerIncrement, fastScaling);
+#else
+              newviewGTRCAT(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                            x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                            ex3, tipX1, tipX2,
+                            width, left, right, wgt, &scalerIncrement, fastScaling);
+#endif
+            }
+          else
+            {
+              
+              if(tr->saveMemory)
+#ifdef __AVX
+                newviewGTRGAMMA_AVX_GAPPED_SAVE(tInfo->tipCase,
+                                                x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                                ex3, tipX1, tipX2,
+                                                width, left, right, wgt, &scalerIncrement, fastScaling,
+                                                x1_gap, x2_gap, x3_gap, 
+                                                x1_gapColumn, x2_gapColumn, x3_gapColumn);
+
+#else
+              newviewGTRGAMMA_GAPPED_SAVE(tInfo->tipCase,
+                                          x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                          ex3, tipX1, tipX2,
+                                          width, left, right, wgt, &scalerIncrement, fastScaling,
+                                          x1_gap, x2_gap, x3_gap, 
+                                          x1_gapColumn, x2_gapColumn, x3_gapColumn);
+#endif
+              else
+#ifdef __AVX
+                newviewGTRGAMMA_AVX(tInfo->tipCase,
+                                    x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                    ex3, tipX1, tipX2,
+                                    width, left, right, wgt, &scalerIncrement, fastScaling);
+#else
+              newviewGTRGAMMA(tInfo->tipCase,
+                              x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                              ex3,tipX1, tipX2,
+                              width, left, right, wgt, &scalerIncrement, fastScaling);
+#endif
+            }
+#endif
+
+            break;                  
+          case 20: /* proteins */
+
+#ifdef __MIC_NATIVE
+
+                        /* CAT & memory saving are not supported on MIC */
+
+                        assert(!tr->saveMemory);
+                        assert(tr->rateHetModel == PLL_GAMMA);
+
+                        if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                        {
+                                  newviewGTRGAMMAPROT_LG4_MIC(tInfo->tipCase,
+                            x1_start, x2_start, x3_start, pr->partitionData[model]->EV_LG4, pr->partitionData[model]->tipVector_LG4,
+                            tipX1, tipX2,
+                            width, left, right, wgt, &scalerIncrement);
+                        }
+                        else
+                        {
+                                  newviewGTRGAMMAPROT_MIC(tInfo->tipCase,
+                                                x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                                ex3, tipX1, tipX2,
+                                                width, left, right, wgt, &scalerIncrement, fastScaling);
+                        }
+#else
+
+            if(tr->rateHetModel == PLL_CAT)
+            {
+
+
+              if(tr->saveMemory)
+#ifdef __AVX
+                newviewGTRCATPROT_AVX_GAPPED_SAVE(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                                  x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                                  ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, fastScaling, 
+                                                  x1_gap, x2_gap, x3_gap,
+                                                  x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
+#else
+              newviewGTRCATPROT_SAVE(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                     x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                     ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, fastScaling, x1_gap, x2_gap, x3_gap,
+                                     x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
+#endif
+              else
+#ifdef __AVX
+                newviewGTRCATPROT_AVX(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                      x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                      ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, fastScaling);
+#else
+              newviewGTRCATPROT(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, fastScaling);                     
+#endif
+            }
+            else
+            {
+
+              
+
+              if(tr->saveMemory)
+#ifdef __AVX
+                newviewGTRGAMMAPROT_AVX_GAPPED_SAVE(tInfo->tipCase,
+                                                    x1_start, x2_start, x3_start,
+                                                    pr->partitionData[model]->EV,
+                                                    pr->partitionData[model]->tipVector,
+                                                    ex3, tipX1, tipX2,
+                                                    width, left, right, wgt, &scalerIncrement, fastScaling,
+                                                    x1_gap, x2_gap, x3_gap,
+                                                    x1_gapColumn, x2_gapColumn, x3_gapColumn);
+#else
+                newviewGTRGAMMAPROT_GAPPED_SAVE(tInfo->tipCase,
+                                                x1_start, x2_start, x3_start,
+                                                pr->partitionData[model]->EV,
+                                                pr->partitionData[model]->tipVector,
+                                                ex3, tipX1, tipX2,
+                                                width, left, right, wgt, &scalerIncrement, fastScaling,
+                                                x1_gap, x2_gap, x3_gap,
+                                                x1_gapColumn, x2_gapColumn, x3_gapColumn);
+#endif
+            
+             else
+                        {
+                          if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                            {
+#ifdef __AVX 
+                              newviewGTRGAMMAPROT_AVX_LG4(tInfo->tipCase,
+                                                          x1_start, x2_start, x3_start,
+                                                          pr->partitionData[model]->EV_LG4,
+                                                          pr->partitionData[model]->tipVector_LG4,
+                                                          (int*)NULL, tipX1, tipX2,
+                                                          width, left, right, wgt, &scalerIncrement, PLL_TRUE);
+#else
+                              newviewGTRGAMMAPROT_LG4(tInfo->tipCase,
+                                                      x1_start, x2_start, x3_start,
+                                                      pr->partitionData[model]->EV_LG4,
+                                                      pr->partitionData[model]->tipVector_LG4,
+                                                      (int*)NULL, tipX1, tipX2,
+                                                      width, left, right, 
+                                                      wgt, &scalerIncrement, PLL_TRUE);
+#endif                      
+                            }
+              else
+#ifdef __AVX
+                newviewGTRGAMMAPROT_AVX(tInfo->tipCase,
+                                        x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                        ex3, tipX1, tipX2,
+                                        width, left, right, wgt, &scalerIncrement, fastScaling);
+#else
+              newviewGTRGAMMAPROT(tInfo->tipCase,
+                                  x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                  ex3, tipX1, tipX2,
+                                  width, left, right, wgt, &scalerIncrement, fastScaling);
+#endif                 
+            }   
+        }
+#endif
+            
+            break;      
+          default:
+            assert(0);
+        }
+#endif
+
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+       if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+       if(pr->partitionData[model]->ascBias)
+#endif         
+         {
+           switch(tr->rateHetModel)
+             {
+             case PLL_CAT:
+               {
+                 double 
+                   rates = 1.0;
+                 
+                 //need to re-calculate transition probabilities assuming a rate of 1.0 
+                 makeP(qz, rz, 
+                       &rates,  
+                       pr->partitionData[model]->EI,
+                       pr->partitionData[model]->EIGN,
+                       1, 
+                       left, right, 
+                       tr->saveMemory,
+                       tr->maxCategories,
+                       states);
+                 
+                 newviewAscCat(tInfo->tipCase,
+                               x1_ascColumn, x2_ascColumn, x3_ascColumn,
+                               pr->partitionData[model]->EV,
+                               pr->partitionData[model]->tipVector,
+                               ex3_asc,
+                               states, left, right, states);
+               }
+               break;
+             case PLL_GAMMA:
+               newviewAscGamma(tInfo->tipCase,
+                               x1_ascColumn, x2_ascColumn, x3_ascColumn,
+                               pr->partitionData[model]->EV,
+                               pr->partitionData[model]->tipVector,
+                               ex3_asc,
+                               states, left, right, states);                        
+               break;
+             default:
+               assert(0);
+             }
+         }
+
+
+        /* important step, here we essentially recursively compute the number of scaling multiplications 
+           at node p: it's the sum of the number of scaling multiplications already conducted 
+           for computing nodes q and r plus the scaling multiplications done at node p */
+
+        if(fastScaling)
+          {
+            pr->partitionData[model]->globalScaler[tInfo->pNumber] =
+              pr->partitionData[model]->globalScaler[tInfo->qNumber] +
+              pr->partitionData[model]->globalScaler[tInfo->rNumber] +
+              (unsigned int)scalerIncrement;
+            
+            /* check that we are not getting an integer overflow ! */
+
+            assert(pr->partitionData[model]->globalScaler[tInfo->pNumber] < INT_MAX);
+          }
+        
+        /* show the output vector */
+      } 
+    }
+  }
+}
+
+/** @brief Compute the traversal descriptor of the subtree rooted at \a p.
+    
+    Computes the traversal descriptor of the subtree with root \a p. By traversal
+    descriptor we essentially mean a preorder traversal of the unrooted topology
+    by rooting it at a node \a p.
+    If \a partialTraversal is set to \b PLL_TRUE then subtrees which are oriented
+    correctly (i.e. if root node \a r of a subtree has \a r->x == 1) are not
+    included in the traversal descriptor.
+
+    @param tr
+      PLL instance
+
+    @param p
+      Node assumed to be the root
+
+    @param partialTraversal
+      If set to \b PLL_TRUE, then a partial traversal descriptor is computed.
+
+    @param numBranches
+      Number of branches (either per-partition branch or joint branch estimate)
+*/
+void computeTraversal(pllInstance *tr, nodeptr p, pllBoolean partialTraversal, int numBranches)
+{
+  /* Only if we apply recomputation do we need the additional step of updating the subtree lengths */
+  if(tr->useRecom)
+  {
+    int traversal_counter = 0;
+    if(partialTraversal)
+      computeTraversalInfoStlen(p, tr->mxtips, tr->rvec, &traversal_counter);
+    else
+      computeFullTraversalInfoStlen(p, tr->mxtips, tr->rvec);
+  }
+  computeTraversalInfo(p, &(tr->td[0].ti[0]), &(tr->td[0].count), tr->mxtips, numBranches, partialTraversal, tr->rvec, tr->useRecom);
+}
+
+
+/** @brief Computes the conditional likelihood vectors of all nodes in the subtree rooted at \a p
+  
+    Compute the conditional likelihood vectors of all nodes in the subtree rooted at node \a p. The
+    conditional likelihood vector at node \a p is recomputed regardless of whether the orientation (i.e. \a p->x) 
+    is correct or not, and, recursuvely, the likelihoods at each node in the subtree as needed and if necessary.
+    In case \a masked is set to \b PLL_TRUE, the computation will not take place at partitions for which the 
+    conditional likelihood has converged (for example as a reult of previous branch length optimization).
+    
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Root of the subtree for which we want to recompute the conditional likelihood vectors
+
+    @param masked
+      If set to \b PLL_TRUE, then likelihood vectors of partitions that are converged are
+      not recomputed.
+ */
+void pllUpdatePartials (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean masked)
+{  
+  /* if it's a tip there is nothing to do */
+
+  if(isTip(p->number, tr->mxtips))
+    return;
+
+  /* the first entry of the traversal descriptor is always reserved for evaluate or branch length optimization calls,
+     hence we start filling the array at the second entry with index one. This is not very nice and should be fixed 
+     at some point */
+
+  tr->td[0].count = 0;
+
+  /* compute the traversal descriptor, which will include the nodes that need updating in the subtree rooted at p */
+  computeTraversal(tr, p, PLL_TRUE, pr->perGeneBranchLengths?pr->numberOfPartitions : 1);
+
+  /* the traversal descriptor has been recomputed -> not sure if it really always changes, something to 
+     optimize in the future */
+  tr->td[0].traversalHasChanged = PLL_TRUE;
+
+  /* We do a masked newview, i.e., do not execute newviews for each partition, when for example 
+     doing a branch length optimization on the entire tree when branches are estimated on a per partition basis.
+
+     you may imagine that for partition 5 the branch length optimization has already converged whereas 
+     for partition 6 we still need to go over the tree again.
+
+     This is explained in more detail in:
+
+     A. Stamatakis, M. Ott: "Load Balance in the Phylogenetic Likelihood Kernel". Proceedings of ICPP 2009
+
+     The external pllBoolean array tr->partitionConverged[] contains exactly that information and is copied
+     to executeModel and subsequently to the executeMask of the traversal descriptor 
+
+*/
+
+
+  if(masked)
+  {
+    int model;
+
+    for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      if(tr->partitionConverged[model])
+        pr->partitionData[model]->executeModel = PLL_FALSE;
+      else
+        pr->partitionData[model]->executeModel = PLL_TRUE;
+    }
+  }
+
+  /* if there is something to re-compute */
+
+  if(tr->td[0].count > 0)
+  {
+    /* store execute mask in traversal descriptor */
+
+    storeExecuteMaskInTraversalDescriptor(tr, pr);
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+    /* do the parallel for join for pthreads
+       note that we do not need a reduction operation here, but just a barrier to make 
+       sure that all threads are done with their partition */
+
+    pllMasterBarrier(tr, pr, PLL_THREAD_NEWVIEW);
+#else
+    /* in the sequential case we now simply call pllNewviewIterative() */
+
+    pllNewviewIterative(tr, pr, 0);
+#endif
+
+  }
+
+  /* clean up */
+
+  if(masked)
+  {
+    int model;
+
+    for(model = 0; model < pr->numberOfPartitions; model++)
+      pr->partitionData[model]->executeModel = PLL_TRUE;
+  }
+
+  tr->td[0].traversalHasChanged = PLL_FALSE;
+}
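+
+/* Illustrative usage sketch (added as a comment; the evaluate call is an assumption
+   about the caller and not part of this file): after changing the topology or a
+   branch length around node p, a caller would typically refresh the conditional
+   likelihood vectors before re-evaluating the likelihood, e.g.
+
+     pllUpdatePartials(tr, pr, p, PLL_FALSE);
+     pllEvaluateLikelihood(tr, pr, p, PLL_FALSE, PLL_FALSE);
+
+   where pllEvaluateLikelihood() denotes the library's evaluate entry point. */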
+
+/* function to compute the marginal ancestral probability vector at a node p for CAT/PSR model */
+
+/** @brief Compute the marginal ancestral probability vector for CAT/PSR model
+    
+    Computes the marginal ancestral probability vector for CAT/PSR model, given the conditional likelihood
+    vector \a x3 of some node, and a zero branch length P matrix \a diagptable.
+
+    @param x3
+      Conditional likelihood of the node for which we are computing the ancestral vector
+
+    @param ancestralBuffer
+      Buffer where to store the marginal ancestral probability vector
+
+    @param diagptable
+      A zero branch length P matrix
+
+    @param n
+      Number of sites in the partition to process (in the case of MPI/PTHREADS, the number of sites in the partition assigned to the current thread/process)
+
+    @param numStates
+      Number of states
+
+    @param cptr
+      Array where the rate category index for each site in the compressed partition alignment is stored
+      
+ */
+static void ancestralCat(double *x3, double *ancestralBuffer, double *diagptable, const int n, const int numStates, int *cptr)
+{ 
+  double 
+    *term = (double*)rax_malloc(sizeof(double) * numStates);
+
+  int 
+    i;
+
+  const int
+    statesSquare = numStates * numStates;
+  
+  for(i = 0; i < n; i++)
+    {
+      double 
+        sum = 0.0,
+        *v = &x3[numStates * i],
+        *ancestral = &ancestralBuffer[numStates * i],
+        *d = &diagptable[cptr[i] * statesSquare];            
+
+      int 
+        l,
+        j;
+
+      for(l = 0; l < numStates; l++)
+        {
+          double 
+            ump_x1 = 0.0;
+      
+          for(j = 0; j < numStates; j++)        
+            ump_x1 += v[j] * d[l * numStates + j];
+
+          sum += ump_x1;
+          term[l] = ump_x1;      
+        }
+                
+      for(l = 0; l < numStates; l++)          
+        ancestral[l] = term[l] / sum;   
+    }
+   
+  rax_free(term);
+}
+
+
+/* compute marginal ancestral states for GAMMA models,
+   for the equation to obtain marginal ancestral states 
+   see Ziheng Yang's book */
+
+/** @brief Compute the marginal ancestral probability vector for GAMMA model
+    
+    Computes the marginal ancestral probability vector for the GAMMA model, given the conditional likelihood
+    vector \a x3 of some node, and a zero branch length P matrix \a diagptable.
+
+    @param x3
+      Conditional likelihood of the node for which we are computing the ancestral vector
+
+    @param ancestralBuffer
+      Buffer where to store the marginal ancestral probability vector
+
+    @param diagptable
+      A zero branch length P matrix
+
+    @param n
+      Number of sites in the partition to process (in the case of MPI/PTHREADS, the number of sites in the partition assigned to the current thread/process)
+
+    @param numStates
+      Number of states
+
+    @param gammaStates
+      Number of GAMMA categories times number of states
+      
+ */
+static void ancestralGamma(double *x3, double *ancestralBuffer, double *diagptable, const int n, const int numStates, const int gammaStates)
+{
+  int 
+    i;
+
+  const int
+    statesSquare = numStates * numStates;
+
+  double    
+    *term = (double*)rax_malloc(sizeof(double) * numStates);                  
+  
+  for(i = 0; i < n; i++)
+    {
+      double 
+        sum = 0.0,
+        *_v = &x3[gammaStates * i],
+        *ancestral = &ancestralBuffer[numStates * i];  
+      
+      int
+        k,
+        j,
+        l;
+      
+      for(l = 0; l < numStates; l++)
+        term[l] = 0.0;
+
+      for(k = 0; k < 4; k++)
+        {
+          double 
+            *v =  &(_v[numStates * k]);
+
+          for(l = 0; l < numStates; l++)
+            {
+              double
+                al = 0.0;
+              
+              for(j = 0; j < numStates; j++)        
+                al += v[j] * diagptable[k * statesSquare + l * numStates + j];
+          
+              term[l] += al;
+              sum += al;
+            }
+        }
+  
+      for(l = 0; l < numStates; l++)        
+        ancestral[l] = term[l] / sum;       
+    }
+   
+  rax_free(term);
+}
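+
+/* In formula form (a restatement of the loops above, added for clarity), the marginal
+   ancestral probability of state l at the node is
+
+     P(l) = ( sum_k sum_j x3[k][j] * P0_k[l][j] )  /  ( sum_l' sum_k sum_j x3[k][j] * P0_k[l'][j] )
+
+   where x3[k][j] is the conditional likelihood of state j under GAMMA rate category k
+   and P0_k is the zero branch length P matrix computed by calc_diagp_Ancestral() for
+   that category. */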
+
+/* compute dedicated zero branch length P matrix */
+/** @brief Compute a dedicated zero branch length P matrix
+   
+    Computes a P matrix by assuming a branch length of zero. This is used
+    for the marginal ancestral probabilities recomputation.
+
+    @param rptr
+      Array of values for rate categories
+
+    @param EI
+      Inverse eigenvector of Q matrix
+
+    @param EIGN
+      Eigenvalues of Q matrix
+
+    @param numberOfCategories
+      Number of rate categories
+
+    @param left
+      Where to store the resulting P matrix
+
+    @param numStates
+      Number of states
+ */
+static void calc_diagp_Ancestral(double *rptr, double *EI,  double *EIGN, int numberOfCategories, double *left, const int numStates)
+{
+  int 
+    i,
+    j,
+    k;
+  
+  const int   
+    statesSquare = numStates * numStates;
+
+  double 
+    z1 = 0.0,
+    lz1[64],
+    d1[64];
+
+  assert(numStates <= 64);
+     
+  for(i = 0; i < numStates; i++)    
+    lz1[i] = EIGN[i] * z1;
+     
+
+  for(i = 0; i < numberOfCategories; i++)
+    {
+      d1[0] = 1.0;
+
+      for(j = 1; j < numStates; j++)    
+        d1[j] = exp(rptr[i] * lz1[j]);
+         
+      for(j = 0; j < numStates; j++)
+        {
+          left[statesSquare * i  + numStates * j] = 1.0;         
+
+          for(k = 1; k < numStates; k++)            
+            left[statesSquare * i + numStates * j + k]  = d1[k] * EI[numStates * j + k];             
+        }
+    }  
+}
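+
+/* Note on the computation above (added for clarity): since the assumed branch length
+   z1 is zero, every exponential term exp(rptr[i] * EIGN[j] * z1) equals 1, so each row
+   j of the resulting matrix is simply (1, EI[j][1], ..., EI[j][numStates-1]), replicated
+   once per rate category. Multiplying a conditional likelihood vector with this matrix
+   is what allows ancestralCat()/ancestralGamma() to obtain real probabilities from the
+   internal likelihood representation. */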
+
+/** @brief A very simple iterative function; we only access the conditional likelihood vector at node \a p
+ *
+ *
+ */
+void newviewAncestralIterative(pllInstance *tr, partitionList *pr)
+{
+  traversalInfo 
+    *ti    = tr->td[0].ti,
+    *tInfo = &ti[0];
+
+  int    
+    model,
+    p_slot = -1;
+
+  /* make sure that the traversal descriptor has length 1 */
+
+  assert(tr->td[0].count == 1);
+  assert(!tr->saveMemory);
+
+  /* get the index to the conditional likelihood vector depending on whether recomputation is used or not */
+
+  if(tr->useRecom)    
+    p_slot = tInfo->slot_p;         
+  else    
+    p_slot = tInfo->pNumber - tr->mxtips - 1;         
+
+  /* now loop over all partitions for nodes p of the current traversal vector entry */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      /* number of sites in this partition */
+      size_t            
+        width  = (size_t)pr->partitionData[model]->width;
+
+      /* this conditional statement is exactly identical to what we do in pllEvaluateIterative */
+
+      if(tr->td[0].executeModel[model] && width > 0)
+        {             
+          double         
+            *x3_start = pr->partitionData[model]->xVector[p_slot],
+//          *left     = (double*)NULL,
+//          *right    = (double*)NULL,                 
+            *rateCategories = (double*)NULL,
+            *diagptable = (double*)NULL;
+
+          int
+            categories;
+        
+          size_t                  
+            states = (size_t)pr->partitionData[model]->states,
+            availableLength = pr->partitionData[model]->xSpaceVector[p_slot],
+            requiredLength = 0,
+            rateHet = discreteRateCategories(tr->rateHetModel);   
+
+        /* figure out what kind of rate heterogeneity approach we are using */
+
+          if(tr->rateHetModel == PLL_CAT)
+            {            
+              rateCategories = pr->partitionData[model]->perSiteRates;
+              categories     = pr->partitionData[model]->numberOfCategories;
+            }
+          else
+            {                            
+              rateCategories = pr->partitionData[model]->gammaRates;
+              categories     = 4;
+            }
+          
+          /* allocate some space for a special P matrix with a branch length of 0 into which we mingle 
+             the eigenvalues. This will allow us to obtain real probabilities from the internal RAxML 
+             representation */
+
+          rax_posix_memalign ((void **)&diagptable, PLL_BYTE_ALIGNMENT, categories * states * states * sizeof(double));
+          
+          requiredLength  =  virtual_width( width ) * rateHet * states * sizeof(double);
+          
+          /* make sure that this vector has already been allocated. This must be PLL_TRUE since we first invoked a standard newview() on this */
+
+          assert(requiredLength == availableLength);                                     
+
+          /* now compute the special P matrix */
+
+          calc_diagp_Ancestral(rateCategories, pr->partitionData[model]->EI,  pr->partitionData[model]->EIGN, categories, diagptable, states);
+          
+          /* switch over the rate heterogeneity model 
+             and call generic functions that compute the marginal ancestral states and 
+             store them in pr->partitionData[model]->ancestralBuffer
+          */
+
+          if(tr->rateHetModel == PLL_CAT)       
+            ancestralCat(x3_start, pr->partitionData[model]->ancestralBuffer, diagptable, width, states, pr->partitionData[model]->rateCategory);
+          else
+            ancestralGamma(x3_start, pr->partitionData[model]->ancestralBuffer, diagptable, width, states, categories * states);
+          
+          rax_free(diagptable);                   
+        }       
+    }
+}
+
+/** @brief Computes the Conditional Likelihood Vector (CLV) for each rate of some internal node.
+
+    Computes the conditional likelihood vectors of node \a p for each rate, given the partition
+    index \a partition. The result is placed in the array \a outProbs, which must be pre-allocated
+    by the caller, and must be of size \a sites * categories * states * sizeof(double). The structure of
+    the resulting array is the following:
+    For each site we have \a categories * states cells of size \a double. Those cells are divided per rate
+    category, i.e. first \a states cells are the probabilities for the states of rate 1 (ordered alphabetically
+    by base name), next \a states cells for rate 2 and so on.
+
+    @param tr   PLL instance
+    @param pr     List of partitions
+    @param p Node for which we want to compute the CLV
+    @param partition   Index of the partition for which to compute the CLV
+    @param outProbs    Pre-allocated array where the result will be stored
+
+    @returns Returns \b PLL_TRUE on success, \b PLL_FALSE on failure
+
+    @todo       Fix to work with CAT
+*/
+int pllGetCLV (pllInstance * tr, partitionList * pr, nodeptr p, int partition, double * outProbs)
+{
+  size_t i, j, k, l;
+
+  if (tr->rateHetModel != PLL_GAMMA) return (PLL_FALSE);
+
+  int p_slot;
+  size_t states = (size_t)pr->partitionData[partition]->states;
+
+  double
+    *term = (double*)rax_malloc(sizeof(double) * states);
+
+  if(tr->useRecom)
+    p_slot = p->number;
+  else
+    p_slot = p->number - tr->mxtips - 1;
+
+  size_t width = (size_t) pr->partitionData[partition]->width;
+  double * diagptable = NULL;
+  double * rateCategories = pr->partitionData[partition]->gammaRates;
+  double * x3 = pr->partitionData[partition]->xVector[p_slot];
+  size_t categories = 4;
+
+  rax_posix_memalign ((void **)&diagptable, PLL_BYTE_ALIGNMENT, categories * states * states * sizeof (double));
+
+  calc_diagp_Ancestral(rateCategories, pr->partitionData[partition]->EI,  pr->partitionData[partition]->EIGN, categories, diagptable, states);
+
+  for (i = 0; i < width; ++ i)
+   {
+     double
+       *_v  = &x3[categories * states * i],
+       *clv = &outProbs[categories * states * i];
+
+     for (k = 0; k < categories; ++ k)
+      {
+        double
+         sum = 0.0,
+         *v = &(_v[states * k]);
+
+        for (l = 0; l < states; ++ l)
+         {
+           double al = 0.0;
+
+           for (j = 0; j < states; ++ j)
+             al += v[j] * diagptable[k * states * states + l * states + j];
+
+           term[l] = al;
+           sum += al;
+         }
+        for (l = 0; l < states; ++ l)
+           clv[k * states + l] = term[l] / sum;
+      }
+   }
+
+  rax_free(term);
+  rax_free(diagptable);
+
+  return (PLL_TRUE);
+}
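+
+/* Illustrative indexing sketch (caller-side assumption, not library code): with the
+   per-category layout documented above and the fixed number of 4 GAMMA categories used
+   here, the probability of state s at site i under rate category k is read as
+
+     double p = outProbs[i * 4 * states + k * states + s];
+
+   where states = pr->partitionData[partition]->states and outProbs was allocated with
+   sites * 4 * states doubles, as stated in the function documentation. */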
+
+/* this is very similar to pllUpdatePartials, except that it also computes the marginal ancestral probabilities 
+   at node p. To simplify the code I am re-using newview() here to first get the likelihood vector p->x at p
+   and then I deploy newviewAncestralIterative(tr); that should always only have a traversal descriptor of length 1,
+   to do some mathematical transformations that are required to obtain the marginal ancestral probabilities from 
+   the conditional likelihood array at p.
+
+   Note that the marginal ancestral probability vector summarizes the subtree rooted at p! */
+
+/** @brief Computes the conditional likelihood vectors of all nodes in the subtree rooted at \a p
+    and the marginal ancestral probabilities at node \a p
+
+    Compute the conditional likelihood vectors of all nodes in the subtree rooted at node \a p. The
+    conditional likelihood vector at node \a p is recomputed regardless of whether the orientation (i.e. \a p->x)
+    is correct or not, and, recursively, the likelihoods at each node in the subtree as needed and if necessary.
+    In addition, the marginal ancestral probability vector for node \a p is also computed.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node for which we want to compute the ancestral vector
+
+    @note
+      This function is not implemented with the saveMemory technique. 
+*/
+void pllUpdatePartialsAncestral(pllInstance *tr, partitionList *pr, nodeptr p)
+{
+  /* error check, we don't need to compute anything for tips */
+  
+  if(isTip(p->number, tr->mxtips))
+    {
+      printf("You are trying to compute the ancestral states on a tip node of the tree\n");
+      assert(0);
+    }
+
+  /* doesn't work yet in conjunction with SEVs, can be implemented though at some point 
+     if urgently required */
+
+  if(tr->saveMemory)
+    {
+      printf("ancestral state implementation will not work with memory saving (SEVs) enabled!\n");
+      printf("returning without computing anything ... \n");
+      return;
+    }
+
+  /* first call pllUpdatePartials() with mask set to PLL_FALSE such that the likelihood vector is there ! */
+
+  pllUpdatePartials(tr, pr, p, PLL_FALSE);
+
+  /* now let's compute the ancestral states using this vector ! */
+  
+  /* to make things easy and reduce code size, let's re-compute a standard traversal descriptor for node p,
+     hence we need to set the count to 0 */
+
+  tr->td[0].count = 0;
+
+  computeTraversalInfo(p, &(tr->td[0].ti[0]), &(tr->td[0].count), tr->mxtips, pr->perGeneBranchLengths?pr->numberOfPartitions : 1, PLL_TRUE, tr->rvec, tr->useRecom);
+
+  tr->td[0].traversalHasChanged = PLL_TRUE;
+
+  /* here we actually assert, that the traversal descriptor only contains one node triplet p, p->next->back, p->next->next->back
+     this must be PLL_TRUE because we have already invoked the standard pllUpdatePartials() on p.
+  */ 
+
+  assert(tr->td[0].count == 1);  
+  
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  /* use the pthreads barrier to invoke newviewAncestralIterative() on a per-thread basis */
+
+  pllMasterBarrier (tr, pr, PLL_THREAD_NEWVIEW_ANCESTRAL);
+#else
+  /* now call the dedicated function that does the mathematical transformation of the 
+     conditional likelihood vector at p to obtain the marginal ancestral states */
+
+  newviewAncestralIterative(tr, pr);
+#endif
+
+  tr->td[0].traversalHasChanged = PLL_FALSE;
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  /* invoke another parallel region to gather the marginal ancestral probabilities 
+     from the threads/MPI processes */
+
+  pllMasterBarrier (tr, pr, PLL_THREAD_GATHER_ANCESTRAL);
+#endif
+
+  
+}
+
+/* returns the character representation of an enumerated DNA or AA state */
+
+/** @brief Get the character representation of an enumerated DNA or AA state
+    
+    Returns the character representation of the enumerated DNA or AA state,
+    from the constant arrays \a dnaStateNames (for DNA) or \a protStateNames (for proteins).
+
+    @param dataType
+      Type of data, i.e. \b PLL_DNA_DATA or \b PLL_AA_DATA
+
+    @param state
+      The number which we want to decode to a letter
+
+    @return
+      Returns the decoded character
+ */
+static char getStateCharacter(int dataType, int state)
+{
+  char 
+    result;
+
+  switch(dataType)
+    {    
+    case PLL_BINARY_DATA:
+       result = binaryStateNames[state];
+       break;
+    case PLL_DNA_DATA:
+       result = dnaStateNames[state];
+      break;
+    case PLL_AA_DATA:
+      result =  protStateNames[state];
+      break;    
+    default:
+      assert(0);
+    }
+
+  return  result;
+}
+
+/** @brief Prints the ancestral state information for a node \a p to the terminal 
+ 
+    Prints the ancestral state information for a node \a p to the terminal. 
+    The ancestral state sequence is printed if \a printStates is set to \b PLL_TRUE, and the
+    marginal ancestral state probabilities are printed if \a printProbs is set to \b PLL_TRUE.
+
+    @param p
+      The node for which to print the ancestral state sequence
+
+    @param printStates
+      If set to \b PLL_TRUE then the ancestral state sequence is printed
+
+    @param printProbs
+      If set to \b PLL_TRUE then the marginal ancestral state probabilities are printed
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+ 
+    @note  Here one can see how to store the ancestral probabilities in a dedicated data structure
+ */
+void printAncestralState(nodeptr p, pllBoolean printStates, pllBoolean printProbs, pllInstance *tr, partitionList *pr)
+{
+#ifdef _USE_PTHREADS
+  size_t 
+    accumulatedOffset = 0;
+#endif
+
+  int
+    j,
+    k,
+    model,
+    globalIndex = 0;
+  
+  /* allocate an array of structs for storing ancestral prob vector info/data */
+
+  ancestralState 
+    *a = (ancestralState *)rax_malloc(sizeof(ancestralState) * tr->originalCrunchedLength);   
+
+  /* loop over partitions */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      int            
+        i,
+        width = pr->partitionData[model]->upper - pr->partitionData[model]->lower,
+        states = pr->partitionData[model]->states;
+      
+      /* set pointer to ancestral probability vector */
+
+#ifdef _USE_PTHREADS
+      double
+        *ancestral = &tr->ancestralVector[accumulatedOffset];
+#else
+      double 
+        *ancestral = pr->partitionData[model]->ancestralBuffer;
+#endif        
+      
+      /* loop over the sites of the partition */
+
+      for(i = 0; i < width; i++, globalIndex++)
+        {
+          double
+            equal = 1.0 / (double)states,
+            max = -1.0;
+            
+          pllBoolean
+            approximatelyEqual = PLL_TRUE;
+
+          int
+            max_l = -1,
+            l;
+          
+          char 
+            c;
+
+          /* store number of states for this site */
+
+          a[globalIndex].states = states;
+
+          /* alloc space for storing marginal ancestral probabilities */
+
+          a[globalIndex].probs = (double *)rax_malloc(sizeof(double) * states);
+          
+          /* loop over states to store probabilities and find the maximum */
+
+          for(l = 0; l < states; l++)
+            {
+              double 
+                value = ancestral[states * i + l];
+
+              if(value > max)
+                {
+                  max = value;
+                  max_l = l;
+                }
+              
+              /* this is used for discretizing the ancestral state sequence: if all marginal ancestral 
+                 probabilities are approximately equal we output a ? */
+
+              approximatelyEqual = approximatelyEqual && (PLL_ABS(equal - value) < 0.000001);
+              
+              a[globalIndex].probs[l] = value;                
+            }
+
+          
+          /* figure out the discrete ancestral nucleotide */
+
+          if(approximatelyEqual)
+            c = '?';      
+          else
+            c = getStateCharacter(pr->partitionData[model]->dataType, max_l);
+          
+          a[globalIndex].c = c;   
+        }
+
+#ifdef _USE_PTHREADS
+      accumulatedOffset += width * states;
+#endif            
+    }
+
+  /* print marginal ancestral probs to terminal */
+
+  if(printProbs)
+    {
+      printf("%d\n", p->number);
+      
+      for(k = 0; k < tr->originalCrunchedLength; k++)
+        {
+          for(j = 0; j < a[k].states; j++)
+            printf("%f ", a[k].probs[j]);
+          printf("\n");      
+        }
+      
+      printf("\n");
+    }
+ 
+  /* print discrete ancestral state sequence to terminal */
+
+  if(printStates)
+    {
+      printf("%d ", p->number);
+
+      for(k = 0; k < tr->originalCrunchedLength; k++)          
+        printf("%c", a[k].c);   
+  
+      printf("\n");
+    }
+  
+  /* free the ancestral state data structure */
+          
+  for(j = 0; j < tr->originalCrunchedLength; j++)
+    rax_free(a[j].probs);  
+
+  rax_free(a);
+}
+
+void pllGetAncestralState(pllInstance *tr, partitionList *pr, nodeptr p, double * outProbs, char * outSequence)
+{
+#ifdef _USE_PTHREADS
+  size_t 
+    accumulatedOffset = 0;
+#endif
+
+  int
+    j,
+    k,
+    model,
+    globalIndex = 0;
+     
+  pllUpdatePartialsAncestral(tr, pr, p);
+  
+  /* allocate an array of structs for storing ancestral prob vector info/data */
+
+  ancestralState 
+    *a = (ancestralState *)rax_malloc(sizeof(ancestralState) * tr->originalCrunchedLength);   
+
+  /* loop over partitions */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      int            
+        i,
+        width = pr->partitionData[model]->upper - pr->partitionData[model]->lower,
+        states = pr->partitionData[model]->states;
+      
+      /* set pointer to ancestral probability vector */
+
+#ifdef _USE_PTHREADS
+      double
+        *ancestral = &tr->ancestralVector[accumulatedOffset];
+#else
+      double 
+        *ancestral = pr->partitionData[model]->ancestralBuffer;
+#endif        
+      
+      /* loop over the sites of the partition */
+
+      for(i = 0; i < width; i++, globalIndex++)
+        {
+          double
+            equal = 1.0 / (double)states,
+            max = -1.0;
+            
+          pllBoolean
+            approximatelyEqual = PLL_TRUE;
+
+          int
+            max_l = -1,
+            l;
+          
+          char 
+            c;
+
+          /* store number of states for this site */
+
+          a[globalIndex].states = states;
+
+          /* alloc space for storing marginal ancestral probabilities */
+
+          a[globalIndex].probs = (double *)rax_malloc(sizeof(double) * states);
+          
+          /* loop over states to store probabilities and find the maximum */
+
+          for(l = 0; l < states; l++)
+            {
+              double 
+                value = ancestral[states * i + l];
+
+              if(value > max)
+                {
+                  max = value;
+                  max_l = l;
+                }
+              
+              /* this is used for discretizing the ancestral state sequence: if all marginal ancestral 
+                 probabilities are approximately equal we output a ? */
+
+              approximatelyEqual = approximatelyEqual && (PLL_ABS(equal - value) < 0.000001);
+              
+              a[globalIndex].probs[l] = value;                
+            }
+
+          
+          /* figure out the discrete ancestral nucleotide */
+
+          if(approximatelyEqual)
+            c = '?';      
+          else
+            c = getStateCharacter(pr->partitionData[model]->dataType, max_l);
+          
+          a[globalIndex].c = c;   
+        }
+
+#ifdef _USE_PTHREADS
+      accumulatedOffset += width * states;
+#endif            
+    }
+
+  /* copy the marginal ancestral probabilities to the output buffer */
+
+  for(k = 0; k < tr->originalCrunchedLength; k++)
+    {
+      for(j = 0; j < a[k].states; j++)
+        outProbs[k * a[k].states + j] = a[k].probs[j];
+    }
+ 
+  /* copy the discrete ancestral state sequence to the output buffer */
+
+  for(k = 0; k < tr->originalCrunchedLength; k++)          
+      outSequence[k] = a[k].c;
+  outSequence[tr->originalCrunchedLength] = 0;
+  
+  /* free the ancestral state data structure */
+          
+  for(j = 0; j < tr->originalCrunchedLength; j++)
+    rax_free(a[j].probs);  
+
+  rax_free(a);
+}
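+
+/* Illustrative usage sketch (caller-side assumption, not library code): the output
+   buffers must be pre-allocated; the sizes follow from the loops above and from the
+   terminating NUL written to outSequence:
+
+     double *probs = (double *)rax_malloc(sizeof(double) * tr->originalCrunchedLength * states);
+     char   *seq   = (char *)rax_malloc(sizeof(char) * (tr->originalCrunchedLength + 1));
+
+     pllGetAncestralState(tr, pr, p, probs, seq);
+
+   where states is the number of states of the partition containing each site; with
+   several partitions of different state counts the per-site stride varies accordingly. */
+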
+/* optimized function implementations */
+
+
+/**
+ *  @defgroup group1 Optimized functions
+ *  This is the optimized functions group
+ */
+
+#if (!defined(__AVX) && defined(__SSE3))
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR GAMMA with memory saving (Optimized SSE3 version for DNA data)
+
+    This is the SSE3 optimized version of ::newviewGAMMA_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b GAMMA
+    model of rate heterogeneity. The memory saving technique is incorporated.
+
+    @note
+    For more details and function argument description check the function ::newviewGAMMA_FLEX
+*/
+static void newviewGTRGAMMA_GAPPED_SAVE(int tipCase,
+                                        double *x1_start, double *x2_start, double *x3_start,
+                                        double *EV, double *tipVector,
+                                        int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                        const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                        unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
+                                        double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn)
+{
+  int     
+    i, 
+    j, 
+    k, 
+    l,
+    addScale = 0, 
+    scaleGap = 0;
+
+  double
+    *x1,
+    *x2,
+    *x3,
+    *x1_ptr = x1_start,
+    *x2_ptr = x2_start,       
+    max;
+  PLL_ALIGN_BEGIN double
+    maxima[2] PLL_ALIGN_END,
+    EV_t[16] PLL_ALIGN_END;
+
+  __m128d 
+    values[8],
+    EVV[8];  
+
+  for(k = 0; k < 4; k++)
+    for (l=0; l < 4; l++)
+      EV_t[4 * l + k] = EV[4 * k + l];
+
+  for(k = 0; k < 8; k++)
+    EVV[k] = _mm_load_pd(&EV_t[k * 2]);      
+
+
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        double *uX1, *uX2;
+        PLL_ALIGN_BEGIN double umpX1[256] PLL_ALIGN_END, umpX2[256] PLL_ALIGN_END;
+
+
+        for (i = 1; i < 16; i++)
+        {           
+          __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
+          __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));       
+
+          for (j = 0; j < 4; j++)
+            for (k = 0; k < 4; k++)
+            {                            
+              __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
+              __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
+
+              __m128d acc = _mm_setzero_pd();
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
+              acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+
+              acc = _mm_hadd_pd(acc, acc);
+              _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
+            }
+
+          for (j = 0; j < 4; j++)
+            for (k = 0; k < 4; k++)
+            {
+              __m128d left1 = _mm_load_pd(&right[j*16 + k*4]);
+              __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]);
+
+              __m128d acc = _mm_setzero_pd();
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
+              acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+
+              acc = _mm_hadd_pd(acc, acc);
+              _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
+
+            }
+        }                 
+
+        uX1 = &umpX1[240];
+        uX2 = &umpX2[240];                          
+
+        for (j = 0; j < 4; j++)
+        {                                                                                  
+          __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
+          __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+
+          __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
+          __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
+
+          __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
+          __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );                                                 
+
+          __m128d EV_t_l0_k0 = EVV[0];
+          __m128d EV_t_l0_k2 = EVV[1];
+          __m128d EV_t_l1_k0 = EVV[2];
+          __m128d EV_t_l1_k2 = EVV[3];
+          __m128d EV_t_l2_k0 = EVV[4];
+          __m128d EV_t_l2_k2 = EVV[5];
+          __m128d EV_t_l3_k0 = EVV[6]; 
+          __m128d EV_t_l3_k2 = EVV[7];
+
+          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+          _mm_store_pd( &x3_gapColumn[j * 4 + 0], EV_t_l0_k0 );
+          _mm_store_pd( &x3_gapColumn[j * 4 + 2], EV_t_l2_k0 );    
+        }  
+
+
+        x3 = x3_start;
+
+        for (i = 0; i < n; i++)
+        {           
+          if(!(x3_gap[i / 32] & mask32[i % 32]))             
+          {
+            uX1 = &umpX1[16 * tipX1[i]];
+            uX2 = &umpX2[16 * tipX2[i]];                                        
+
+            for (j = 0; j < 4; j++)
+            {                                                                              
+              __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
+              __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+
+
+              __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
+              __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
+
+
+              //
+              // multiply left * right
+              //
+
+              __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
+              __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
+
+
+              //
+              // multiply with EV matrix (!?)
+              //
+
+              __m128d EV_t_l0_k0 = EVV[0];
+              __m128d EV_t_l0_k2 = EVV[1];
+              __m128d EV_t_l1_k0 = EVV[2];
+              __m128d EV_t_l1_k2 = EVV[3];
+              __m128d EV_t_l2_k0 = EVV[4];
+              __m128d EV_t_l2_k2 = EVV[5];
+              __m128d EV_t_l3_k0 = EVV[6]; 
+              __m128d EV_t_l3_k2 = EVV[7];
+
+              EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+              EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+              EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+              EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+              EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+              EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+              EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+              EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+              EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+              EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+              EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+              EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+              EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+              EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+              _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
+              _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
+            }
+
+            x3 += 16;
+          }
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      { 
+        double 
+          *uX1;
+        PLL_ALIGN_BEGIN double
+          umpX1[256] PLL_ALIGN_END;
+
+        for (i = 1; i < 16; i++)
+        {
+          __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
+          __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));       
+
+          for (j = 0; j < 4; j++)
+            for (k = 0; k < 4; k++)
+            {            
+              __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
+              __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
+
+              __m128d acc = _mm_setzero_pd();
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
+              acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+
+              acc = _mm_hadd_pd(acc, acc);
+              _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);                
+            }
+        }
+
+        {
+          __m128d maxv =_mm_setzero_pd();
+
+          scaleGap = 0;
+
+          x2 = x2_gapColumn;                     
+          x3 = x3_gapColumn;
+
+          uX1 = &umpX1[240];         
+
+          for (j = 0; j < 4; j++)
+          {                                
+            double *x2_p = &x2[j*4];
+            double *right_k0_p = &right[j*16];
+            double *right_k1_p = &right[j*16 + 1*4];
+            double *right_k2_p = &right[j*16 + 2*4];
+            double *right_k3_p = &right[j*16 + 3*4];
+            __m128d x2_0 = _mm_load_pd( &x2_p[0] );
+            __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+
+            __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
+            __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
+            __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
+            __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
+            __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
+            __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
+            __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
+            __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+
+            right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+            right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+            right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+            right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+            right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+            right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+            right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+            right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+            right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+            right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+            right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+            right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+            right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+            right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
+
+            __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
+            __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+
+            __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
+            __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
+
+            __m128d EV_t_l0_k0 = EVV[0];
+            __m128d EV_t_l0_k2 = EVV[1];
+            __m128d EV_t_l1_k0 = EVV[2];
+            __m128d EV_t_l1_k2 = EVV[3];
+            __m128d EV_t_l2_k0 = EVV[4];
+            __m128d EV_t_l2_k2 = EVV[5];
+            __m128d EV_t_l3_k0 = EVV[6]; 
+            __m128d EV_t_l3_k2 = EVV[7];
+
+            EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+            EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+            EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+            EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+            EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+            EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+            EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+            EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+            EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+            EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+            EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+            EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+            values[j * 2]     = EV_t_l0_k0;
+            values[j * 2 + 1] = EV_t_l2_k0;                                
+
+            maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
+            maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));                                    
+          }
+
+
+          _mm_store_pd(maxima, maxv);
+
+          max = PLL_MAX(maxima[0], maxima[1]);
+
+          if(max < PLL_MINLIKELIHOOD)
+          {
+            scaleGap = 1;
+
+            __m128d sv = _mm_set1_pd(PLL_TWOTOTHE256);
+
+            _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));       
+            _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
+            _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
+            _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
+            _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));       
+            _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
+            _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
+            _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));                        
+          }
+          else
+          {
+            _mm_store_pd(&x3[0], values[0]);       
+            _mm_store_pd(&x3[2], values[1]);
+            _mm_store_pd(&x3[4], values[2]);
+            _mm_store_pd(&x3[6], values[3]);
+            _mm_store_pd(&x3[8], values[4]);       
+            _mm_store_pd(&x3[10], values[5]);
+            _mm_store_pd(&x3[12], values[6]);
+            _mm_store_pd(&x3[14], values[7]);
+          }
+        }                       
+
+        x3 = x3_start;
+
+        for (i = 0; i < n; i++)
+        {
+          if((x3_gap[i / 32] & mask32[i % 32]))
+          {            
+            if(scaleGap)
+            {   
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];                  
+            }
+          }
+          else
+          {                              
+            __m128d maxv =_mm_setzero_pd();              
+
+            if(x2_gap[i / 32] & mask32[i % 32])
+              x2 = x2_gapColumn;
+            else
+            {
+              x2 = x2_ptr;
+              x2_ptr += 16;
+            }
+
+            uX1 = &umpX1[16 * tipX1[i]];             
+
+
+            for (j = 0; j < 4; j++)
+            {                              
+              double *x2_p = &x2[j*4];
+              double *right_k0_p = &right[j*16];
+              double *right_k1_p = &right[j*16 + 1*4];
+              double *right_k2_p = &right[j*16 + 2*4];
+              double *right_k3_p = &right[j*16 + 3*4];
+              __m128d x2_0 = _mm_load_pd( &x2_p[0] );
+              __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+
+              __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
+              __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
+              __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
+              __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
+              __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
+              __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
+              __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
+              __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+
+
+              right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+              right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+              right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+              right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+              right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+              right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+              right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+
+              right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+              right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+
+              right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+              right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+              right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+              right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+              right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
+
+              {
+                //
+                // load left side from tip vector
+                //
+
+                __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
+                __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+
+
+                //
+                // multiply left * right
+                //
+
+                __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
+                __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
+
+
+                //
+                // multiply with EV matrix (!?)
+                //                                
+
+                __m128d EV_t_l0_k0 = EVV[0];
+                __m128d EV_t_l0_k2 = EVV[1];
+                __m128d EV_t_l1_k0 = EVV[2];
+                __m128d EV_t_l1_k2 = EVV[3];
+                __m128d EV_t_l2_k0 = EVV[4];
+                __m128d EV_t_l2_k2 = EVV[5];
+                __m128d EV_t_l3_k0 = EVV[6]; 
+                __m128d EV_t_l3_k2 = EVV[7];
+
+
+                EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+                EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+                EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+                EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+                EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+                EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+                EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+                EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+                EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+                EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+                EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+                EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+                EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+                EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+                values[j * 2]     = EV_t_l0_k0;
+                values[j * 2 + 1] = EV_t_l2_k0;                            
+
+                maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
+                maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));                
+              }            
+            }
+
+
+            _mm_store_pd(maxima, maxv);
+
+            max = PLL_MAX(maxima[0], maxima[1]);
+
+            if(max < PLL_MINLIKELIHOOD)
+            {
+              __m128d sv = _mm_set1_pd(PLL_TWOTOTHE256);
+
+              _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));     
+              _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
+              _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
+              _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
+              _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));     
+              _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
+              _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
+              _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));      
+
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];
+
+            }
+            else
+            {
+              _mm_store_pd(&x3[0], values[0]);     
+              _mm_store_pd(&x3[2], values[1]);
+              _mm_store_pd(&x3[4], values[2]);
+              _mm_store_pd(&x3[6], values[3]);
+              _mm_store_pd(&x3[8], values[4]);     
+              _mm_store_pd(&x3[10], values[5]);
+              _mm_store_pd(&x3[12], values[6]);
+              _mm_store_pd(&x3[14], values[7]);
+            }            
+
+            x3 += 16;
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:         
+      {
+        __m128d maxv =_mm_setzero_pd();
+
+        scaleGap = 0;
+
+        x1 = x1_gapColumn;                  
+        x2 = x2_gapColumn;          
+        x3 = x3_gapColumn;
+
+        for (j = 0; j < 4; j++)
+        {
+
+          double *x1_p = &x1[j*4];
+          double *left_k0_p = &left[j*16];
+          double *left_k1_p = &left[j*16 + 1*4];
+          double *left_k2_p = &left[j*16 + 2*4];
+          double *left_k3_p = &left[j*16 + 3*4];
+
+          __m128d x1_0 = _mm_load_pd( &x1_p[0] );
+          __m128d x1_2 = _mm_load_pd( &x1_p[2] );
+
+          __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
+          __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
+          __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
+          __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
+          __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
+          __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
+          __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
+          __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
+
+          left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+          left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+          left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+          left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+          left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+          left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+          left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+          left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+          left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+          left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+
+          double *x2_p = &x2[j*4];
+          double *right_k0_p = &right[j*16];
+          double *right_k1_p = &right[j*16 + 1*4];
+          double *right_k2_p = &right[j*16 + 2*4];
+          double *right_k3_p = &right[j*16 + 3*4];
+          __m128d x2_0 = _mm_load_pd( &x2_p[0] );
+          __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+
+          __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
+          __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
+          __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
+          __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
+          __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
+          __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
+          __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
+          __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+
+          right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+          right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+          right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+          right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+          right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+          right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+          right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+          right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+          right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+          right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);                                    
+
+          __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+          __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );                                          
+
+          __m128d EV_t_l0_k0 = EVV[0];
+          __m128d EV_t_l0_k2 = EVV[1];
+          __m128d EV_t_l1_k0 = EVV[2];
+          __m128d EV_t_l1_k2 = EVV[3];
+          __m128d EV_t_l2_k0 = EVV[4];
+          __m128d EV_t_l2_k2 = EVV[5];
+          __m128d EV_t_l3_k0 = EVV[6]; 
+          __m128d EV_t_l3_k2 = EVV[7];
+
+          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+
+          values[j * 2] = EV_t_l0_k0;
+          values[j * 2 + 1] = EV_t_l2_k0;                           
+
+          maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
+          maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
+        }
+
+        _mm_store_pd(maxima, maxv);
+
+        max = PLL_MAX(maxima[0], maxima[1]);
+
+        if(max < PLL_MINLIKELIHOOD)
+        {
+          __m128d sv = _mm_set1_pd(PLL_TWOTOTHE256);
+
+          scaleGap = 1;
+
+          _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));         
+          _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
+          _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
+          _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
+          _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));         
+          _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
+          _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
+          _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));                      
+        }
+        else
+        {
+          _mm_store_pd(&x3[0], values[0]);         
+          _mm_store_pd(&x3[2], values[1]);
+          _mm_store_pd(&x3[4], values[2]);
+          _mm_store_pd(&x3[6], values[3]);
+          _mm_store_pd(&x3[8], values[4]);         
+          _mm_store_pd(&x3[10], values[5]);
+          _mm_store_pd(&x3[12], values[6]);
+          _mm_store_pd(&x3[14], values[7]);
+        }
+      }
+
+
+      x3 = x3_start;
+
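+      /* As in the tip/inner case above, per-site vectors are stored
+         contiguously for non-gap sites only: x1_ptr, x2_ptr and x3 are
+         advanced by 16 doubles exactly when the corresponding entry is not
+         a gap, which is where the memory saving of this kernel comes from. */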
+      for (i = 0; i < n; i++)
+      { 
+        if(x3_gap[i / 32] & mask32[i % 32])
+        {            
+          if(scaleGap)
+          {     
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];                              
+          }
+        }
+        else
+        {
+          __m128d maxv =_mm_setzero_pd();                   
+
+          if(x1_gap[i / 32] & mask32[i % 32])
+            x1 = x1_gapColumn;
+          else
+          {
+            x1 = x1_ptr;
+            x1_ptr += 16;
+          }
+
+          if(x2_gap[i / 32] & mask32[i % 32])
+            x2 = x2_gapColumn;
+          else
+          {
+            x2 = x2_ptr;
+            x2_ptr += 16;
+          }
+
+
+          for (j = 0; j < 4; j++)
+          {
+
+            double *x1_p = &x1[j*4];
+            double *left_k0_p = &left[j*16];
+            double *left_k1_p = &left[j*16 + 1*4];
+            double *left_k2_p = &left[j*16 + 2*4];
+            double *left_k3_p = &left[j*16 + 3*4];
+
+            __m128d x1_0 = _mm_load_pd( &x1_p[0] );
+            __m128d x1_2 = _mm_load_pd( &x1_p[2] );
+
+            __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
+            __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
+            __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
+            __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
+            __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
+            __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
+            __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
+            __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
+
+            left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+            left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+            left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+            left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+            left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+            left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+            left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+            left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+            left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+            left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+            left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+            left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+            left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+            left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+
+            //
+            // multiply/add right side
+            //
+            double *x2_p = &x2[j*4];
+            double *right_k0_p = &right[j*16];
+            double *right_k1_p = &right[j*16 + 1*4];
+            double *right_k2_p = &right[j*16 + 2*4];
+            double *right_k3_p = &right[j*16 + 3*4];
+            __m128d x2_0 = _mm_load_pd( &x2_p[0] );
+            __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+
+            __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
+            __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
+            __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
+            __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
+            __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
+            __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
+            __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
+            __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+
+            right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+            right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+            right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+            right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+            right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+            right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+            right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+            right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+            right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+
+            right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+            right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+            right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+            right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+            right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);     
+
+            //
+            // multiply left * right
+            //
+
+            __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+            __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+
+            //
+            // multiply with EV matrix (!?)
+            //       
+
+            __m128d EV_t_l0_k0 = EVV[0];
+            __m128d EV_t_l0_k2 = EVV[1];
+            __m128d EV_t_l1_k0 = EVV[2];
+            __m128d EV_t_l1_k2 = EVV[3];
+            __m128d EV_t_l2_k0 = EVV[4];
+            __m128d EV_t_l2_k2 = EVV[5];
+            __m128d EV_t_l3_k0 = EVV[6]; 
+            __m128d EV_t_l3_k2 = EVV[7];
+
+
+            EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+            EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+            EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+            EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+            EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+            EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+            EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+            EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+            EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+
+            EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+            EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+            EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+
+            values[j * 2] = EV_t_l0_k0;
+            values[j * 2 + 1] = EV_t_l2_k0;                         
+
+            maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
+            maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
+          }
+
+
+          _mm_store_pd(maxima, maxv);
+
+          max = PLL_MAX(maxima[0], maxima[1]);
+
+          if(max < PLL_MINLIKELIHOOD)
+          {
+            __m128d sv = _mm_set1_pd(PLL_TWOTOTHE256);
+
+            _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));       
+            _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
+            _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
+            _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
+            _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));       
+            _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
+            _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
+            _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));        
+
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];
+
+          }
+          else
+          {
+            _mm_store_pd(&x3[0], values[0]);       
+            _mm_store_pd(&x3[2], values[1]);
+            _mm_store_pd(&x3[4], values[2]);
+            _mm_store_pd(&x3[6], values[3]);
+            _mm_store_pd(&x3[8], values[4]);       
+            _mm_store_pd(&x3[10], values[5]);
+            _mm_store_pd(&x3[12], values[6]);
+            _mm_store_pd(&x3[14], values[7]);
+          }      
+
+
+
+          x3 += 16;
+
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
+
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR GAMMA (Optimized SSE3 version for DNA data)
+
+    This is the SSE3 optimized version of ::newviewGAMMA_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b GAMMA
+    model of rate heterogeneity.
+
+    @note
+    For more details and function argument description check the function ::newviewGAMMA_FLEX
+*/
+static void newviewGTRGAMMA(int tipCase,
+                            double *x1_start, double *x2_start, double *x3_start,
+                            double *EV, double *tipVector,
+                            int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                            const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling
+                            )
+{
+  int 
+    i, 
+    j, 
+    k, 
+    l,
+    addScale = 0;
+
+  //int scaling = 0;
+
+  double
+    *x1,
+    *x2,
+    *x3,
+    max;
+  PLL_ALIGN_BEGIN double
+    maxima[2] PLL_ALIGN_END,
+    EV_t[16] PLL_ALIGN_END;
+
+  __m128d 
+    values[8],
+    EVV[8];  
+
+  for(k = 0; k < 4; k++)
+    for (l=0; l < 4; l++)
+      EV_t[4 * l + k] = EV[4 * k + l];
+
+  for(k = 0; k < 8; k++)
+    EVV[k] = _mm_load_pd(&EV_t[k * 2]);
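+  /* The eigenvector matrix EV of the substitution model is transposed into
+     EV_t and packed into eight __m128d halves (EVV) so that each of its
+     columns can be processed as two 2-double SIMD lanes in the loops below. */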
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        double *uX1, *uX2;
+        PLL_ALIGN_BEGIN double umpX1[256] PLL_ALIGN_END, umpX2[256] PLL_ALIGN_END;
+
+
+        for (i = 1; i < 16; i++)
+        {
+          __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
+          __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));       
+
+          for (j = 0; j < 4; j++)
+
+            for (k = 0; k < 4; k++) {
+              __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
+              __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
+
+              __m128d acc = _mm_setzero_pd();
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
+              acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+
+              acc = _mm_hadd_pd(acc, acc);
+              _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
+            }
+
+          for (j = 0; j < 4; j++)
+            for (k = 0; k < 4; k++)
+            {
+              __m128d left1 = _mm_load_pd(&right[j*16 + k*4]);
+              __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]);
+
+              __m128d acc = _mm_setzero_pd();
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
+              acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+
+              acc = _mm_hadd_pd(acc, acc);
+              _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
+
+            }
+        }       
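+        /* The loops above precompute, for each of the 15 possible (possibly
+           ambiguous) nucleotide states 1-15 and every rate category, the dot
+           products of the corresponding tip vector with the rows of the left
+           and right P matrices (umpX1/umpX2).  The per-site loop below then
+           only looks these values up via tipX1[i]/tipX2[i]; note that no
+           scaling check is performed in this tip/tip case. */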
+
+        for (i = 0; i < n; i++)
+        {
+          x3 = &x3_start[i * 16];
+
+
+          uX1 = &umpX1[16 * tipX1[i]];
+          uX2 = &umpX2[16 * tipX2[i]];                      
+
+          for (j = 0; j < 4; j++)
+          {                                                                                
+            __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
+            __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+
+
+            __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
+            __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
+
+
+            //
+            // multiply left * right
+            //
+
+            __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
+            __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
+
+
+            //
+            // multiply with EV matrix (!?)
+            //
+
+            __m128d EV_t_l0_k0 = EVV[0];
+            __m128d EV_t_l0_k2 = EVV[1];
+            __m128d EV_t_l1_k0 = EVV[2];
+            __m128d EV_t_l1_k2 = EVV[3];
+            __m128d EV_t_l2_k0 = EVV[4];
+            __m128d EV_t_l2_k2 = EVV[5];
+            __m128d EV_t_l3_k0 = EVV[6]; 
+            __m128d EV_t_l3_k2 = EVV[7];
+
+            EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+            EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+            EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+            EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+            EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+            EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+            EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+            EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+            EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+            EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+            EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+            EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+            _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
+            _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
+          }
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      { 
+        double *uX1;
+        PLL_ALIGN_BEGIN double umpX1[256] PLL_ALIGN_END;
+
+
+        for (i = 1; i < 16; i++)
+        {
+          __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
+          __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));       
+
+          for (j = 0; j < 4; j++)
+            for (k = 0; k < 4; k++)
+            {            
+              __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
+              __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
+
+              __m128d acc = _mm_setzero_pd();
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
+              acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+
+              acc = _mm_hadd_pd(acc, acc);
+              _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);                
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+          __m128d maxv =_mm_setzero_pd();
+
+          x2 = &x2_start[i * 16];
+          x3 = &x3_start[i * 16];
+
+          uX1 = &umpX1[16 * tipX1[i]];       
+
+          for (j = 0; j < 4; j++)
+          {
+
+            //
+            // multiply/add right side
+            //
+            double *x2_p = &x2[j*4];
+            double *right_k0_p = &right[j*16];
+            double *right_k1_p = &right[j*16 + 1*4];
+            double *right_k2_p = &right[j*16 + 2*4];
+            double *right_k3_p = &right[j*16 + 3*4];
+            __m128d x2_0 = _mm_load_pd( &x2_p[0] );
+            __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+
+            __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
+            __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
+            __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
+            __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
+            __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
+            __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
+            __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
+            __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+
+
+
+            right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+            right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+            right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+            right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+            right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+            right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+            right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+
+            right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+            right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+
+            right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+            right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+            right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+            right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+            right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
+
+            {
+              //
+              // load left side from tip vector
+              //
+
+              __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
+              __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+
+
+              //
+              // multiply left * right
+              //
+
+              __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
+              __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
+
+
+              //
+              // multiply with EV matrix (!?)
+              //                                  
+
+              __m128d EV_t_l0_k0 = EVV[0];
+              __m128d EV_t_l0_k2 = EVV[1];
+              __m128d EV_t_l1_k0 = EVV[2];
+              __m128d EV_t_l1_k2 = EVV[3];
+              __m128d EV_t_l2_k0 = EVV[4];
+              __m128d EV_t_l2_k2 = EVV[5];
+              __m128d EV_t_l3_k0 = EVV[6]; 
+              __m128d EV_t_l3_k2 = EVV[7];
+
+
+              EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+              EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+              EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+              EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+              EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+              EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+              EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+              EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+              EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+              EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+              EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+              EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+              EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+              EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+              values[j * 2]     = EV_t_l0_k0;
+              values[j * 2 + 1] = EV_t_l2_k0;                              
+
+              maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
+              maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));                  
+            }
+          }
+
+
+          _mm_store_pd(maxima, maxv);
+
+          max = PLL_MAX(maxima[0], maxima[1]);
+
+          if(max < PLL_MINLIKELIHOOD)
+          {
+            __m128d sv = _mm_set1_pd(PLL_TWOTOTHE256);
+
+            _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));       
+            _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
+            _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
+            _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
+            _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));       
+            _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
+            _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
+            _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));        
+
+             if(!fastScaling)
+               ex3[i] += 1;
+             else
+               addScale += wgt[i];
+
+          }
+          else
+          {
+            _mm_store_pd(&x3[0], values[0]);       
+            _mm_store_pd(&x3[2], values[1]);
+            _mm_store_pd(&x3[4], values[2]);
+            _mm_store_pd(&x3[6], values[3]);
+            _mm_store_pd(&x3[8], values[4]);       
+            _mm_store_pd(&x3[10], values[5]);
+            _mm_store_pd(&x3[12], values[6]);
+            _mm_store_pd(&x3[14], values[7]);
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:
+
+      for (i = 0; i < n; i++)
+      {
+        __m128d maxv =_mm_setzero_pd();
+
+
+        x1 = &x1_start[i * 16];
+        x2 = &x2_start[i * 16];
+        x3 = &x3_start[i * 16];
+
+        for (j = 0; j < 4; j++)
+        {
+
+          double *x1_p = &x1[j*4];
+          double *left_k0_p = &left[j*16];
+          double *left_k1_p = &left[j*16 + 1*4];
+          double *left_k2_p = &left[j*16 + 2*4];
+          double *left_k3_p = &left[j*16 + 3*4];
+
+          __m128d x1_0 = _mm_load_pd( &x1_p[0] );
+          __m128d x1_2 = _mm_load_pd( &x1_p[2] );
+
+          __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
+          __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
+          __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
+          __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
+          __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
+          __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
+          __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
+          __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
+
+          left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+          left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+          left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+          left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+          left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+          left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+          left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+          left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+          left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+          left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+
+          //
+          // multiply/add right side
+          //
+          double *x2_p = &x2[j*4];
+          double *right_k0_p = &right[j*16];
+          double *right_k1_p = &right[j*16 + 1*4];
+          double *right_k2_p = &right[j*16 + 2*4];
+          double *right_k3_p = &right[j*16 + 3*4];
+          __m128d x2_0 = _mm_load_pd( &x2_p[0] );
+          __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+
+          __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
+          __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
+          __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
+          __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
+          __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
+          __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
+          __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
+          __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+
+          right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+          right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+          right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+          right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+          right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+          right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+          right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+
+          right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+          right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+          right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);       
+
+          //
+          // multiply left * right
+          //
+
+          __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+          __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+
+          //
+          // multiply with EV matrix (!?)
+          //         
+
+          __m128d EV_t_l0_k0 = EVV[0];
+          __m128d EV_t_l0_k2 = EVV[1];
+          __m128d EV_t_l1_k0 = EVV[2];
+          __m128d EV_t_l1_k2 = EVV[3];
+          __m128d EV_t_l2_k0 = EVV[4];
+          __m128d EV_t_l2_k2 = EVV[5];
+          __m128d EV_t_l3_k0 = EVV[6]; 
+          __m128d EV_t_l3_k2 = EVV[7];
+
+
+          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+
+          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+
+          values[j * 2] = EV_t_l0_k0;
+          values[j * 2 + 1] = EV_t_l2_k0;                           
+
+          maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
+          maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
+        }
+
+
+        _mm_store_pd(maxima, maxv);
+
+        max = PLL_MAX(maxima[0], maxima[1]);
+
+        if(max < PLL_MINLIKELIHOOD)
+        {
+          __m128d sv = _mm_set1_pd(PLL_TWOTOTHE256);
+
+          _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));         
+          _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
+          _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
+          _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
+          _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));         
+          _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
+          _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
+          _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));          
+
+           if(!fastScaling)
+             ex3[i] += 1;
+           else
+             addScale += wgt[i];        
+        }
+        else
+        {
+          _mm_store_pd(&x3[0], values[0]);         
+          _mm_store_pd(&x3[2], values[1]);
+          _mm_store_pd(&x3[4], values[2]);
+          _mm_store_pd(&x3[6], values[3]);
+          _mm_store_pd(&x3[8], values[4]);         
+          _mm_store_pd(&x3[10], values[5]);
+          _mm_store_pd(&x3[12], values[6]);
+          _mm_store_pd(&x3[14], values[7]);
+        }        
+      }
+
+      break;
+    default:
+      assert(0);
+  }
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
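+/* For reference, a scalar sketch of what the SSE3 blocks above compute for
+   one site and one rate category j (illustrative only; the temporaries tmp,
+   l, r, v and the loop indices are ours, not part of the library):
+
+     for (k = 0; k < 4; k++) {
+       double l = 0.0, r = 0.0;
+       for (m = 0; m < 4; m++) {
+         l += left [j*16 + k*4 + m] * x1[j*4 + m];
+         r += right[j*16 + k*4 + m] * x2[j*4 + m];
+       }
+       tmp[k] = l * r;
+     }
+     for (k = 0; k < 4; k++) {
+       double v = 0.0;
+       for (m = 0; m < 4; m++)
+         v += EV[m*4 + k] * tmp[m];    // EV_t[k*4 + m] == EV[m*4 + k]
+       x3[j*4 + k] = v;
+     }
+*/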
+
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR CAT (Optimized SSE3 version for DNA data)
+
+    This is the SSE3 optimized version of ::newviewCAT_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b CAT
+    model of rate heterogeneity.
+
+    @note
+    For more details and function argument description check the function ::newviewCAT_FLEX
+*/
+static void newviewGTRCAT( int tipCase,  double *EV,  int *cptr,
+                           double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+                           int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                           int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
+{
+  double
+    *le,
+    *ri,
+    *x1,
+    *x2, 
+    *x3;
+  PLL_ALIGN_BEGIN double
+    EV_t[16] PLL_ALIGN_END;
+
+  int 
+    i, 
+    j, 
+    scale, 
+    addScale = 0;
+
+  __m128d
+    minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD ),
+                      sc = _mm_set1_pd(PLL_TWOTOTHE256),
+                      EVV[8];  
+
+  for(i = 0; i < 4; i++)
+    for (j=0; j < 4; j++)
+      EV_t[4 * j + i] = EV[4 * i + j];
+
+  for(i = 0; i < 8; i++)
+    EVV[i] = _mm_load_pd(&EV_t[i * 2]);
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:      
+      for (i = 0; i < n; i++)
+      {  
+        x1 = &(tipVector[4 * tipX1[i]]);
+        x2 = &(tipVector[4 * tipX2[i]]);
+
+        x3 = &x3_start[i * 4];
+
+        le =  &left[cptr[i] * 16];
+        ri =  &right[cptr[i] * 16];
+
+        __m128d x1_0 = _mm_load_pd( &x1[0] );
+        __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+        __m128d left_k0_0 = _mm_load_pd( &le[0] );
+        __m128d left_k0_2 = _mm_load_pd( &le[2] );
+        __m128d left_k1_0 = _mm_load_pd( &le[4] );
+        __m128d left_k1_2 = _mm_load_pd( &le[6] );
+        __m128d left_k2_0 = _mm_load_pd( &le[8] );
+        __m128d left_k2_2 = _mm_load_pd( &le[10] );
+        __m128d left_k3_0 = _mm_load_pd( &le[12] );
+        __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+        left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+        left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+        left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+        left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+        left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+        left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+        left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+        left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+        left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+        left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+        left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+        left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+        left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+        left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+        __m128d x2_0 = _mm_load_pd( &x2[0] );
+        __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+        __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+        __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+        __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+        __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+        __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+        __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+        __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+        __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+        right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+        right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+        right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+        right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+        right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+        right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+        right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+        right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+        right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+        right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+        right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+        right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+        right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+        right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);         
+
+        __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+        __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );           
+
+        __m128d EV_t_l0_k0 = EVV[0];
+        __m128d EV_t_l0_k2 = EVV[1];
+        __m128d EV_t_l1_k0 = EVV[2];
+        __m128d EV_t_l1_k2 = EVV[3];
+        __m128d EV_t_l2_k0 = EVV[4];
+        __m128d EV_t_l2_k2 = EVV[5];
+        __m128d EV_t_l3_k0 = EVV[6];
+        __m128d EV_t_l3_k2 = EVV[7];
+
+        EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+        EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+        EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+        EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+        EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+        EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+        EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+        EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+        EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+        EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+        EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+        EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+        EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+        EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );      
+
+        _mm_store_pd(x3, EV_t_l0_k0);
+        _mm_store_pd(&x3[2], EV_t_l2_k0);                                   
+      }
+      break;
+    case PLL_TIP_INNER:      
+      for (i = 0; i < n; i++)
+      {
+        x1 = &(tipVector[4 * tipX1[i]]);
+        x2 = &x2_start[4 * i];
+        x3 = &x3_start[4 * i];
+
+        le =  &left[cptr[i] * 16];
+        ri =  &right[cptr[i] * 16];
+
+        __m128d x1_0 = _mm_load_pd( &x1[0] );
+        __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+        __m128d left_k0_0 = _mm_load_pd( &le[0] );
+        __m128d left_k0_2 = _mm_load_pd( &le[2] );
+        __m128d left_k1_0 = _mm_load_pd( &le[4] );
+        __m128d left_k1_2 = _mm_load_pd( &le[6] );
+        __m128d left_k2_0 = _mm_load_pd( &le[8] );
+        __m128d left_k2_2 = _mm_load_pd( &le[10] );
+        __m128d left_k3_0 = _mm_load_pd( &le[12] );
+        __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+        left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+        left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+        left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+        left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+        left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+        left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+        left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+        left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+        left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+        left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+        left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+        left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+        left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+        left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+        __m128d x2_0 = _mm_load_pd( &x2[0] );
+        __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+        __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+        __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+        __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+        __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+        __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+        __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+        __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+        __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+        right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+        right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+        right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+        right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+        right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+        right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+        right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+        right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+        right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+        right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+        right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+        right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+        right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+        right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);         
+
+        __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+        __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+        __m128d EV_t_l0_k0 = EVV[0];
+        __m128d EV_t_l0_k2 = EVV[1];
+        __m128d EV_t_l1_k0 = EVV[2];
+        __m128d EV_t_l1_k2 = EVV[3];
+        __m128d EV_t_l2_k0 = EVV[4];
+        __m128d EV_t_l2_k2 = EVV[5];
+        __m128d EV_t_l3_k0 = EVV[6];
+        __m128d EV_t_l3_k2 = EVV[7];
+
+
+        EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+        EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+        EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+        EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+        EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+        EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+        EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+        EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+        EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+        EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+        EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+        EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+        EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+        EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );                                       
+
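+        /* Underflow check: rescale only if all four entries of the new
+           4-entry vector are below PLL_MINLIKELIHOOD in absolute value.
+           _mm_movemask_pd(v1) == 3 means both lanes of the comparison are
+           true, so a result != 3 in either pair clears the scale flag. */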
+        scale = 1;
+
+        __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
+        v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+        if(_mm_movemask_pd( v1 ) != 3)
+          scale = 0;
+        else
+        {
+          v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
+          v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+          if(_mm_movemask_pd( v1 ) != 3)
+            scale = 0;
+        }
+
+        if(scale)
+        {                     
+          _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
+          _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));                   
+
+           if(!fastScaling)
+             ex3[i] += 1;
+           else
+             addScale += wgt[i];          
+        }       
+        else
+        {
+          _mm_store_pd(x3, EV_t_l0_k0);
+          _mm_store_pd(&x3[2], EV_t_l2_k0);
+        }
+
+
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+      {
+        x1 = &x1_start[4 * i];
+        x2 = &x2_start[4 * i];
+        x3 = &x3_start[4 * i];
+
+        le =  &left[cptr[i] * 16];
+        ri =  &right[cptr[i] * 16];
+
+        __m128d x1_0 = _mm_load_pd( &x1[0] );
+        __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+        __m128d left_k0_0 = _mm_load_pd( &le[0] );
+        __m128d left_k0_2 = _mm_load_pd( &le[2] );
+        __m128d left_k1_0 = _mm_load_pd( &le[4] );
+        __m128d left_k1_2 = _mm_load_pd( &le[6] );
+        __m128d left_k2_0 = _mm_load_pd( &le[8] );
+        __m128d left_k2_2 = _mm_load_pd( &le[10] );
+        __m128d left_k3_0 = _mm_load_pd( &le[12] );
+        __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+        left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+        left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+        left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+        left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+        left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+        left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+        left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+        left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+        left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+        left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+        left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+        left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+        left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+        left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+        __m128d x2_0 = _mm_load_pd( &x2[0] );
+        __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+        __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+        __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+        __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+        __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+        __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+        __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+        __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+        __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+        right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+        right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+        right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+        right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+        right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+        right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+        right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+        right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+        right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+        right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+        right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+        right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+        right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+        right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);         
+
+        __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+        __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+        __m128d EV_t_l0_k0 = EVV[0];
+        __m128d EV_t_l0_k2 = EVV[1];
+        __m128d EV_t_l1_k0 = EVV[2];
+        __m128d EV_t_l1_k2 = EVV[3];
+        __m128d EV_t_l2_k0 = EVV[4];
+        __m128d EV_t_l2_k2 = EVV[5];
+        __m128d EV_t_l3_k0 = EVV[6];
+        __m128d EV_t_l3_k2 = EVV[7];
+
+
+        EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+        EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+        EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+        EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+        EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+        EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+        EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+        EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+        EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+        EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+        EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+        EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+        EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+        EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );                                              
+
+        scale = 1;
+
+        __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
+        v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+        if(_mm_movemask_pd( v1 ) != 3)
+          scale = 0;
+        else
+        {
+          v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
+          v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+          if(_mm_movemask_pd( v1 ) != 3)
+            scale = 0;
+        }
+
+        if(scale)
+        {                     
+          _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
+          _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));                   
+
+          if(!fastScaling)
+            ex3[i] += 1;
+          else
+            addScale += wgt[i];   
+        }       
+        else
+        {
+          _mm_store_pd(x3, EV_t_l0_k0);
+          _mm_store_pd(&x3[2], EV_t_l2_k0);
+        }
+
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+#endif
+
+/** @brief Check whether the position \a pos in bitvector \a x is a gap
+    
+    @param x
+      A bitvector represented by unsigned integers
+
+    @param pos
+      Position to check in \a x if it is set (i.e. it is a gap) 
+
+    @return
+      Returns a non-zero value if the bit at position \a pos is set (i.e. the site is a gap), or \b 0 if it is not
+*/
+//#ifndef __clang__
+//__inline
+//#endif
+pllBoolean isGap(unsigned int *x, int pos)
+{
+  return (x[pos / 32] & mask32[pos % 32]);
+}
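+/* isGap(v, i) is equivalent to the open-coded test  v[i / 32] & mask32[i % 32]
+   used inline in the SSE3 kernels above; a typical use looks like this
+   (illustrative sketch only):
+
+     if (isGap(x2_gap, i))
+       x2 = x2_gapColumn;              - re-use the shared gap column
+     else
+       { x2 = x2_ptr; x2_ptr += 16; }  - next stored non-gap site vector
+*/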
+
+/** @brief Check whether the position \a pos in bitvector \a x is \b NOT a gap
+    
+    @param x
+      A bitvector represented by unsigned integers
+
+    @param pos
+      Position to check in \a x if it is \b NOT set (i.e. it is \b NOT a gap) 
+
+    @return
+      Returns \b 1 if the bit at position \a pos is \b not set (i.e. the site is not a gap), or \b 0 if it is set
+*/
+//#ifndef __clang__
+//__inline
+//#endif
+pllBoolean noGap(unsigned int *x, int pos)
+{
+  return (!(x[pos / 32] & mask32[pos % 32]));
+}
+
+#if (!defined(__AVX) && defined(__SSE3))
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR CAT with memory saving (Optimized SSE3 version for DNA data)
+
+    This is the SSE3 optimized version of ::newviewCAT_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b CAT
+    model of rate heterogeneity. The memory saving technique is incorporated.
+
+    @note
+    For more details and function argument description check the function ::newviewCAT_FLEX
+*/
+static void newviewGTRCAT_SAVE( int tipCase,  double *EV,  int *cptr,
+                                double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+                                int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
+{
+  double
+    *le,
+    *ri,
+    *x1,
+    *x2,
+    *x3,
+    *x1_ptr = x1_start,
+    *x2_ptr = x2_start, 
+    *x3_ptr = x3_start;
+  PLL_ALIGN_BEGIN double
+    EV_t[16] PLL_ALIGN_END;
+
+  int 
+    i, 
+    j, 
+    scale, 
+    scaleGap = 0,
+    addScale = 0;
+
+  __m128d
+    minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD ),
+                      sc = _mm_set1_pd(PLL_TWOTOTHE256),
+                      EVV[8];  
+
+  for(i = 0; i < 4; i++)
+    for (j=0; j < 4; j++)
+      EV_t[4 * j + i] = EV[4 * i + j];
+
+  for(i = 0; i < 8; i++)
+    EVV[i] = _mm_load_pd(&EV_t[i * 2]);
+
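+  /* The shared gap column is computed once, up front, for the sites that are
+     gaps at this node.  Its transition matrices are read from the slot at
+     index maxCats in left/right, which the memory-saving setup appears to
+     reserve for exactly this purpose. */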
+  {
+    x1 = x1_gapColumn;        
+    x2 = x2_gapColumn;
+    x3 = x3_gapColumn;
+
+    le =  &left[maxCats * 16];           
+    ri =  &right[maxCats * 16];                                                  
+
+    __m128d x1_0 = _mm_load_pd( &x1[0] );
+    __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+    __m128d left_k0_0 = _mm_load_pd( &le[0] );
+    __m128d left_k0_2 = _mm_load_pd( &le[2] );
+    __m128d left_k1_0 = _mm_load_pd( &le[4] );
+    __m128d left_k1_2 = _mm_load_pd( &le[6] );
+    __m128d left_k2_0 = _mm_load_pd( &le[8] );
+    __m128d left_k2_2 = _mm_load_pd( &le[10] );
+    __m128d left_k3_0 = _mm_load_pd( &le[12] );
+    __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+    left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+    left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+    left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+    left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+    left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+    left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+    left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+    left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+    left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+    left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+    __m128d x2_0 = _mm_load_pd( &x2[0] );
+    __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+    __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+    __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+    __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+    __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+    __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+    __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+    __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+    __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+    right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+    right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+    right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+    right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+    right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+    right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+    right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+    right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+    right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+    right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);     
+
+    __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+    __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+    __m128d EV_t_l0_k0 = EVV[0];
+    __m128d EV_t_l0_k2 = EVV[1];
+    __m128d EV_t_l1_k0 = EVV[2];
+    __m128d EV_t_l1_k2 = EVV[3];
+    __m128d EV_t_l2_k0 = EVV[4];
+    __m128d EV_t_l2_k2 = EVV[5];
+    __m128d EV_t_l3_k0 = EVV[6];
+    __m128d EV_t_l3_k2 = EVV[7];
+
+    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );                                   
+
+    if(tipCase != PLL_TIP_TIP)
+    {    
+      scale = 1;
+
+      __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
+      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+      if(_mm_movemask_pd( v1 ) != 3)
+        scale = 0;
+      else
+      {
+        v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
+        v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+        if(_mm_movemask_pd( v1 ) != 3)
+          scale = 0;
+      }
+
+      if(scale)
+      {               
+        _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
+        _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));                     
+
+        scaleGap = PLL_TRUE;       
+      } 
+      else
+      {
+        _mm_store_pd(x3, EV_t_l0_k0);
+        _mm_store_pd(&x3[2], EV_t_l2_k0);
+      }
+    }
+    else
+    {
+      _mm_store_pd(x3, EV_t_l0_k0);
+      _mm_store_pd(&x3[2], EV_t_l2_k0);
+    }
+  }
+
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:      
+      for (i = 0; i < n; i++)
+      {
+        if(noGap(x3_gap, i))
+        {
+          x1 = &(tipVector[4 * tipX1[i]]);
+          x2 = &(tipVector[4 * tipX2[i]]);
+
+          x3 = x3_ptr;
+
+          if(isGap(x1_gap, i))
+            le =  &left[maxCats * 16];
+          else            
+            le =  &left[cptr[i] * 16];    
+
+          if(isGap(x2_gap, i))
+            ri =  &right[maxCats * 16];
+          else            
+            ri =  &right[cptr[i] * 16];
+
+          __m128d x1_0 = _mm_load_pd( &x1[0] );
+          __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+          __m128d left_k0_0 = _mm_load_pd( &le[0] );
+          __m128d left_k0_2 = _mm_load_pd( &le[2] );
+          __m128d left_k1_0 = _mm_load_pd( &le[4] );
+          __m128d left_k1_2 = _mm_load_pd( &le[6] );
+          __m128d left_k2_0 = _mm_load_pd( &le[8] );
+          __m128d left_k2_2 = _mm_load_pd( &le[10] );
+          __m128d left_k3_0 = _mm_load_pd( &le[12] );
+          __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+          left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+          left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+          left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+          left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+          left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+          left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+          left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+          left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+          left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+          left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+          __m128d x2_0 = _mm_load_pd( &x2[0] );
+          __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+          __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+          __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+          __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+          __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+          __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+          __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+          __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+          __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+          right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+          right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+          right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+          right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+          right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+          right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+          right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+          right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+          right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+          right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);       
+
+          __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+          __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );                 
+
+          __m128d EV_t_l0_k0 = EVV[0];
+          __m128d EV_t_l0_k2 = EVV[1];
+          __m128d EV_t_l1_k0 = EVV[2];
+          __m128d EV_t_l1_k2 = EVV[3];
+          __m128d EV_t_l2_k0 = EVV[4];
+          __m128d EV_t_l2_k2 = EVV[5];
+          __m128d EV_t_l3_k0 = EVV[6];
+          __m128d EV_t_l3_k2 = EVV[7];
+
+          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );    
+
+          _mm_store_pd(x3, EV_t_l0_k0);
+          _mm_store_pd(&x3[2], EV_t_l2_k0);                                 
+
+          x3_ptr += 4;
+        }
+      }
+      break;
+    case PLL_TIP_INNER:      
+      for (i = 0; i < n; i++)
+      { 
+        if(isGap(x3_gap, i))
+        {
+          if(scaleGap)
+            {
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];
+            }
+        }
+        else
+        {             
+          x1 = &(tipVector[4 * tipX1[i]]);
+
+          x2 = x2_ptr;
+          x3 = x3_ptr;
+
+          if(isGap(x1_gap, i))
+            le =  &left[maxCats * 16];
+          else
+            le =  &left[cptr[i] * 16];
+
+          if(isGap(x2_gap, i))
+          {              
+            ri =  &right[maxCats * 16];
+            x2 = x2_gapColumn;
+          }
+          else
+          {
+            ri =  &right[cptr[i] * 16];
+            x2 = x2_ptr;
+            x2_ptr += 4;
+          }                               
+
+          __m128d x1_0 = _mm_load_pd( &x1[0] );
+          __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+          __m128d left_k0_0 = _mm_load_pd( &le[0] );
+          __m128d left_k0_2 = _mm_load_pd( &le[2] );
+          __m128d left_k1_0 = _mm_load_pd( &le[4] );
+          __m128d left_k1_2 = _mm_load_pd( &le[6] );
+          __m128d left_k2_0 = _mm_load_pd( &le[8] );
+          __m128d left_k2_2 = _mm_load_pd( &le[10] );
+          __m128d left_k3_0 = _mm_load_pd( &le[12] );
+          __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+          left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+          left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+          left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+          left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+          left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+          left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+          left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+          left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+          left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+          left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+          __m128d x2_0 = _mm_load_pd( &x2[0] );
+          __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+          __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+          __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+          __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+          __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+          __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+          __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+          __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+          __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+          right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+          right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+          right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+          right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+          right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+          right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+          right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+          right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+          right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+          right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);       
+
+          __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+          __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+          __m128d EV_t_l0_k0 = EVV[0];
+          __m128d EV_t_l0_k2 = EVV[1];
+          __m128d EV_t_l1_k0 = EVV[2];
+          __m128d EV_t_l1_k2 = EVV[3];
+          __m128d EV_t_l2_k0 = EVV[4];
+          __m128d EV_t_l2_k2 = EVV[5];
+          __m128d EV_t_l3_k0 = EVV[6];
+          __m128d EV_t_l3_k2 = EVV[7];
+
+
+          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );                                     
+
+          scale = 1;
+
+          __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
+          v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+          if(_mm_movemask_pd( v1 ) != 3)
+            scale = 0;
+          else
+          {
+            v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
+            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(_mm_movemask_pd( v1 ) != 3)
+              scale = 0;
+          }
+
+          if(scale)
+          {                   
+            _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
+            _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));                 
+            
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];         
+          }     
+          else
+          {
+            _mm_store_pd(x3, EV_t_l0_k0);
+            _mm_store_pd(&x3[2], EV_t_l2_k0);
+          }
+
+          x3_ptr += 4;
+        }
+
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+      { 
+        if(isGap(x3_gap, i))
+        {
+          if(scaleGap)
+            {
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];
+            }
+        }
+        else
+        {            
+          x3 = x3_ptr;
+
+          if(isGap(x1_gap, i))
+          {
+            x1 = x1_gapColumn;
+            le =  &left[maxCats * 16];
+          }
+          else
+          {
+            le =  &left[cptr[i] * 16];
+            x1 = x1_ptr;
+            x1_ptr += 4;
+          }
+
+          if(isGap(x2_gap, i))  
+          {
+            x2 = x2_gapColumn;
+            ri =  &right[maxCats * 16];     
+          }
+          else
+          {
+            ri =  &right[cptr[i] * 16];
+            x2 = x2_ptr;
+            x2_ptr += 4;
+          }                               
+
+          __m128d x1_0 = _mm_load_pd( &x1[0] );
+          __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+          __m128d left_k0_0 = _mm_load_pd( &le[0] );
+          __m128d left_k0_2 = _mm_load_pd( &le[2] );
+          __m128d left_k1_0 = _mm_load_pd( &le[4] );
+          __m128d left_k1_2 = _mm_load_pd( &le[6] );
+          __m128d left_k2_0 = _mm_load_pd( &le[8] );
+          __m128d left_k2_2 = _mm_load_pd( &le[10] );
+          __m128d left_k3_0 = _mm_load_pd( &le[12] );
+          __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+          left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+          left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+          left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+          left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+          left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+          left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+          left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+          left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+          left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+          left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+          __m128d x2_0 = _mm_load_pd( &x2[0] );
+          __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+          __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+          __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+          __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+          __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+          __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+          __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+          __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+          __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+          right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+          right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+          right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+          right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+          right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+          right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+          right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+          right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+          right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+          right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);       
+
+          __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+          __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+          __m128d EV_t_l0_k0 = EVV[0];
+          __m128d EV_t_l0_k2 = EVV[1];
+          __m128d EV_t_l1_k0 = EVV[2];
+          __m128d EV_t_l1_k2 = EVV[3];
+          __m128d EV_t_l2_k0 = EVV[4];
+          __m128d EV_t_l2_k2 = EVV[5];
+          __m128d EV_t_l3_k0 = EVV[6];
+          __m128d EV_t_l3_k2 = EVV[7];
+
+
+          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );                                            
+
+          scale = 1;
+
+          __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
+          v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+          if(_mm_movemask_pd( v1 ) != 3)
+            scale = 0;
+          else
+          {
+            v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
+            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(_mm_movemask_pd( v1 ) != 3)
+              scale = 0;
+          }
+
+          if(scale)
+          {                   
+            _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
+            _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));                 
+
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];         
+          }     
+          else
+          {
+            _mm_store_pd(x3, EV_t_l0_k0);
+            _mm_store_pd(&x3[2], EV_t_l2_k0);
+          }
+
+          x3_ptr += 4;
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
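+/* Editorial note: a minimal scalar sketch, illustrative only and not part of
+   the upstream PLL patch, of the memory-saving scheme the "_SAVE" kernels in
+   this file implement.  The conditional likelihood of an all-gap column is
+   computed once into x3_gapColumn, per-site vectors are stored compactly with
+   gap sites omitted, and the gap bitvectors select each site's inputs.  The
+   elementwise product below stands in for the real transition-matrix update,
+   and "span" would be 4 for DNA and 80 for protein GAMMA data. */
+static void exampleGapColumnNewview(int n, int span,
+                                    unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                    double *x1_compact, double *x2_compact, double *x3_compact,
+                                    double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn)
+{
+  double *x1_ptr = x1_compact, *x2_ptr = x2_compact, *x3_ptr = x3_compact;
+  int i, k;
+
+  /* the shared gap-column entries are computed exactly once */
+  for(k = 0; k < span; k++)
+    x3_gapColumn[k] = x1_gapColumn[k] * x2_gapColumn[k];
+
+  for(i = 0; i < n; i++)
+  {
+    double *v1, *v2;
+
+    /* gap site: its result is the shared gap column, so nothing is stored
+       (the real kernels only do scaler bookkeeping here) */
+    if(isGap(x3_gap, i))
+      continue;
+
+    /* each input is either the shared gap column or the next compact entry */
+    if(isGap(x1_gap, i)) v1 = x1_gapColumn; else { v1 = x1_ptr; x1_ptr += span; }
+    if(isGap(x2_gap, i)) v2 = x2_gapColumn; else { v2 = x2_ptr; x2_ptr += span; }
+
+    for(k = 0; k < span; k++)
+      x3_ptr[k] = v1[k] * v2[k];     /* placeholder for the real update */
+
+    x3_ptr += span;                  /* output advances only for non-gap sites */
+  }
+}
+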
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR GAMMA with memory saving (Optimized SSE3 version for AA data)
+
+    This is the SSE3 optimized version of ::newviewGAMMA_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b GAMMA
+    model of rate heterogeneity. The memory saving technique is incorporated.
+
+    @note
+    For more details and function argument description check the function ::newviewGAMMA_FLEX
+*/
+static void newviewGTRGAMMAPROT_GAPPED_SAVE(int tipCase,
+                                            double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                                            int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                            int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                            unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,  
+                                            double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
+                                            )
+{
+  double  *uX1, *uX2, *v;
+  double x1px2;
+  int  i, j, l, k, scale, addScale = 0,   
+       gapScaling = 0;
+  double 
+    *vl, *vr, *x1v, *x2v,
+    *x1_ptr = x1,
+    *x2_ptr = x2,
+    *x3_ptr = x3;
+
+
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        double umpX1[1840], umpX2[1840];
+
+        for(i = 0; i < 23; i++)
+        {
+          v = &(tipVector[20 * i]);
+
+          for(k = 0; k < 80; k++)
+          {
+            double *ll =  &left[k * 20];
+            double *rr =  &right[k * 20];
+
+            __m128d umpX1v = _mm_setzero_pd();
+            __m128d umpX2v = _mm_setzero_pd();
+
+            for(l = 0; l < 20; l+=2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
+              umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));                                 
+            }
+
+            umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
+            umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
+
+            _mm_storel_pd(&umpX1[80 * i + k], umpX1v);
+            _mm_storel_pd(&umpX2[80 * i + k], umpX2v);
+          }
+        }
+
+        {
+          uX1 = &umpX1[1760];
+          uX2 = &umpX2[1760];
+
+          for(j = 0; j < 4; j++)
+          {
+            v = &x3_gapColumn[j * 20];
+
+            __m128d zero =  _mm_setzero_pd();
+            for(k = 0; k < 20; k+=2)                                
+              _mm_store_pd(&v[k], zero);
+
+            for(k = 0; k < 20; k++)
+            { 
+              double *eev = &extEV[k * 20];
+              x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+              __m128d x1px2v = _mm_set1_pd(x1px2);
+
+              for(l = 0; l < 20; l+=2)
+              {
+                __m128d vv = _mm_load_pd(&v[l]);
+                __m128d ee = _mm_load_pd(&eev[l]);
+
+                vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+
+                _mm_store_pd(&v[l], vv);
+              }
+            }
+          }        
+        }       
+
+        for(i = 0; i < n; i++)
+        {
+          if(!(x3_gap[i / 32] & mask32[i % 32]))
+          {
+            uX1 = &umpX1[80 * tipX1[i]];
+            uX2 = &umpX2[80 * tipX2[i]];
+
+            for(j = 0; j < 4; j++)
+            {
+              v = &x3_ptr[j * 20];
+
+
+              __m128d zero =  _mm_setzero_pd();
+              for(k = 0; k < 20; k+=2)                              
+                _mm_store_pd(&v[k], zero);
+
+              for(k = 0; k < 20; k++)
+              { 
+                double *eev = &extEV[k * 20];
+                x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+                __m128d x1px2v = _mm_set1_pd(x1px2);
+
+                for(l = 0; l < 20; l+=2)
+                {
+                  __m128d vv = _mm_load_pd(&v[l]);
+                  __m128d ee = _mm_load_pd(&eev[l]);
+
+                  vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+
+                  _mm_store_pd(&v[l], vv);
+                }
+              }
+            }      
+            x3_ptr += 80;
+          }
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        double umpX1[1840], ump_x2[20];
+
+
+        for(i = 0; i < 23; i++)
+        {
+          v = &(tipVector[20 * i]);
+
+          for(k = 0; k < 80; k++)
+          {
+            double *ll =  &left[k * 20];
+
+            __m128d umpX1v = _mm_setzero_pd();
+
+            for(l = 0; l < 20; l+=2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));                                                 
+            }
+
+            umpX1v = _mm_hadd_pd(umpX1v, umpX1v);                               
+            _mm_storel_pd(&umpX1[80 * i + k], umpX1v);          
+
+          }
+        }
+
+        {
+          uX1 = &umpX1[1760];
+
+          for(k = 0; k < 4; k++)
+          {
+            v = &(x2_gapColumn[k * 20]);
+
+            for(l = 0; l < 20; l++)
+            {              
+              double *r =  &right[k * 400 + l * 20];
+              __m128d ump_x2v = _mm_setzero_pd();           
+
+              for(j = 0; j < 20; j+= 2)
+              {
+                __m128d vv = _mm_load_pd(&v[j]);
+                __m128d rr = _mm_load_pd(&r[j]);
+                ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
+              }
+
+              ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+
+              _mm_storel_pd(&ump_x2[l], ump_x2v);                                    
+            }
+
+            v = &(x3_gapColumn[20 * k]);
+
+            __m128d zero =  _mm_setzero_pd();
+            for(l = 0; l < 20; l+=2)                                
+              _mm_store_pd(&v[l], zero);
+
+            for(l = 0; l < 20; l++)
+            {
+              double *eev = &extEV[l * 20];
+              x1px2 = uX1[k * 20 + l]  * ump_x2[l];
+              __m128d x1px2v = _mm_set1_pd(x1px2);
+
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d vv = _mm_load_pd(&v[j]);
+                __m128d ee = _mm_load_pd(&eev[j]);
+
+                vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+
+                _mm_store_pd(&v[j], vv);
+              }                             
+            }                   
+
+          }
+
+          { 
+            v = x3_gapColumn;
+            __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+            scale = 1;
+            for(l = 0; scale && (l < 80); l += 2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              __m128d v1 = _mm_and_pd(vv, absMask.m);
+              v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+              if(_mm_movemask_pd( v1 ) != 3)
+                scale = 0;
+            }             
+          }
+
+
+          if (scale)
+          {
+            gapScaling = 1;
+            __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+            for(l = 0; l < 80; l+=2)
+            {
+              __m128d ex3v = _mm_load_pd(&v[l]);                  
+              _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));      
+            }                                                          
+          }
+        }
+
+        for (i = 0; i < n; i++)
+        {           
+          if((x3_gap[i / 32] & mask32[i % 32]))
+          {            
+            if(gapScaling)
+            {   
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];                  
+            }
+          }
+          else
+          {
+            uX1 = &umpX1[80 * tipX1[i]];
+
+            if(x2_gap[i / 32] & mask32[i % 32])
+              x2v = x2_gapColumn;
+            else
+            {
+              x2v = x2_ptr;
+              x2_ptr += 80;
+            }
+
+            for(k = 0; k < 4; k++)
+            {
+              v = &(x2v[k * 20]);
+
+              for(l = 0; l < 20; l++)
+              {            
+                double *r =  &right[k * 400 + l * 20];
+                __m128d ump_x2v = _mm_setzero_pd();         
+
+                for(j = 0; j < 20; j+= 2)
+                {
+                  __m128d vv = _mm_load_pd(&v[j]);
+                  __m128d rr = _mm_load_pd(&r[j]);
+                  ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
+                }
+
+                ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+
+                _mm_storel_pd(&ump_x2[l], ump_x2v);                                  
+              }
+
+              v = &x3_ptr[20 * k];
+
+              __m128d zero =  _mm_setzero_pd();
+              for(l = 0; l < 20; l+=2)                              
+                _mm_store_pd(&v[l], zero);
+
+              for(l = 0; l < 20; l++)
+              {
+                double *eev = &extEV[l * 20];
+                x1px2 = uX1[k * 20 + l]  * ump_x2[l];
+                __m128d x1px2v = _mm_set1_pd(x1px2);
+
+                for(j = 0; j < 20; j+=2)
+                {
+                  __m128d vv = _mm_load_pd(&v[j]);
+                  __m128d ee = _mm_load_pd(&eev[j]);
+
+                  vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+
+                  _mm_store_pd(&v[j], vv);
+                }                                   
+              }                 
+
+            }
+
+
+            { 
+              v = x3_ptr;
+              __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+              scale = 1;
+              for(l = 0; scale && (l < 80); l += 2)
+              {
+                __m128d vv = _mm_load_pd(&v[l]);
+                __m128d v1 = _mm_and_pd(vv, absMask.m);
+                v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+                if(_mm_movemask_pd( v1 ) != 3)
+                  scale = 0;
+              }           
+            }
+
+
+            if (scale)
+            {
+              __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+              for(l = 0; l < 80; l+=2)
+              {
+                __m128d ex3v = _mm_load_pd(&v[l]);                
+                _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));    
+              }                           
+              
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];                   
+            }
+
+            x3_ptr += 80;
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:
+      {
+        for(k = 0; k < 4; k++)
+        {
+          vl = &(x1_gapColumn[20 * k]);
+          vr = &(x2_gapColumn[20 * k]);
+          v =  &(x3_gapColumn[20 * k]);
+
+          __m128d zero =  _mm_setzero_pd();
+          for(l = 0; l < 20; l+=2)                                  
+            _mm_store_pd(&v[l], zero);
+
+          for(l = 0; l < 20; l++)
+          {              
+            {
+              __m128d al = _mm_setzero_pd();
+              __m128d ar = _mm_setzero_pd();
+
+              double *ll   = &left[k * 400 + l * 20];
+              double *rr   = &right[k * 400 + l * 20];
+              double *EVEV = &extEV[20 * l];
+
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d lv  = _mm_load_pd(&ll[j]);
+                __m128d rv  = _mm_load_pd(&rr[j]);
+                __m128d vll = _mm_load_pd(&vl[j]);
+                __m128d vrr = _mm_load_pd(&vr[j]);
+
+                al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
+                ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
+              }                  
+
+              al = _mm_hadd_pd(al, al);
+              ar = _mm_hadd_pd(ar, ar);
+
+              al = _mm_mul_pd(al, ar);
+
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d vv  = _mm_load_pd(&v[j]);
+                __m128d EVV = _mm_load_pd(&EVEV[j]);
+
+                vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+
+                _mm_store_pd(&v[j], vv);
+              }                                           
+            }            
+
+          }
+        }
+
+
+        { 
+          v = x3_gapColumn;
+          __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+          scale = 1;
+          for(l = 0; scale && (l < 80); l += 2)
+          {
+            __m128d vv = _mm_load_pd(&v[l]);
+            __m128d v1 = _mm_and_pd(vv, absMask.m);
+            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(_mm_movemask_pd( v1 ) != 3)
+              scale = 0;
+          }               
+        }
+
+        if (scale)
+        {
+          gapScaling = 1;
+          __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+          for(l = 0; l < 80; l+=2)
+          {
+            __m128d ex3v = _mm_load_pd(&v[l]);            
+            _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));        
+          }                               
+
+
+        }
+      }
+
+      for (i = 0; i < n; i++)
+      {
+        if(x3_gap[i / 32] & mask32[i % 32])
+        {            
+          if(gapScaling)
+          {     
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];                              
+          }
+        }
+        else
+        {
+          if(x1_gap[i / 32] & mask32[i % 32])
+            x1v = x1_gapColumn;
+          else
+          {
+            x1v = x1_ptr;
+            x1_ptr += 80;
+          }
+
+          if(x2_gap[i / 32] & mask32[i % 32])
+            x2v = x2_gapColumn;
+          else
+          {
+            x2v = x2_ptr;
+            x2_ptr += 80;
+          }
+
+          for(k = 0; k < 4; k++)
+          {
+            vl = &(x1v[20 * k]);
+            vr = &(x2v[20 * k]);
+            v =  &x3_ptr[20 * k];
+
+            __m128d zero =  _mm_setzero_pd();
+            for(l = 0; l < 20; l+=2)                                
+              _mm_store_pd(&v[l], zero);
+
+            for(l = 0; l < 20; l++)
+            {            
+              {
+                __m128d al = _mm_setzero_pd();
+                __m128d ar = _mm_setzero_pd();
+
+                double *ll   = &left[k * 400 + l * 20];
+                double *rr   = &right[k * 400 + l * 20];
+                double *EVEV = &extEV[20 * l];
+
+                for(j = 0; j < 20; j+=2)
+                {
+                  __m128d lv  = _mm_load_pd(&ll[j]);
+                  __m128d rv  = _mm_load_pd(&rr[j]);
+                  __m128d vll = _mm_load_pd(&vl[j]);
+                  __m128d vrr = _mm_load_pd(&vr[j]);
+
+                  al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
+                  ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
+                }                
+
+                al = _mm_hadd_pd(al, al);
+                ar = _mm_hadd_pd(ar, ar);
+
+                al = _mm_mul_pd(al, ar);
+
+                for(j = 0; j < 20; j+=2)
+                {
+                  __m128d vv  = _mm_load_pd(&v[j]);
+                  __m128d EVV = _mm_load_pd(&EVEV[j]);
+
+                  vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+
+                  _mm_store_pd(&v[j], vv);
+                }                                                 
+              }          
+
+            }
+          }
+
+
+
+          { 
+            v = x3_ptr;
+            __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+            scale = 1;
+            for(l = 0; scale && (l < 80); l += 2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              __m128d v1 = _mm_and_pd(vv, absMask.m);
+              v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+              if(_mm_movemask_pd( v1 ) != 3)
+                scale = 0;
+            }             
+          }
+
+
+          if (scale)
+          {
+            __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+            for(l = 0; l < 80; l+=2)
+            {
+              __m128d ex3v = _mm_load_pd(&v[l]);                  
+              _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));      
+            }                             
+
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];                         
+          }
+          x3_ptr += 80;
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  if(fastScaling)
+    *scalerIncrement = addScale;  
+}
+
+
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR GAMMA (Optimized SSE3 version for AA data)
+
+    This is the SSE3 optimized version of ::newviewGAMMA_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b GAMMA
+    model of rate heterogeneity.
+
+    @note
+    For more details and function argument description check the function ::newviewGAMMA_FLEX
+*/
+static void newviewGTRGAMMAPROT(int tipCase,
+                                double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                                int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
+{
+  double  *uX1, *uX2, *v;
+  double x1px2;
+  int  i, j, l, k, scale, addScale = 0;
+  double *vl, *vr;
+
+
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        double umpX1[1840], umpX2[1840];
+
+        for(i = 0; i < 23; i++)
+        {
+          v = &(tipVector[20 * i]);
+
+          for(k = 0; k < 80; k++)
+          {
+            double *ll =  &left[k * 20];
+            double *rr =  &right[k * 20];
+
+            __m128d umpX1v = _mm_setzero_pd();
+            __m128d umpX2v = _mm_setzero_pd();
+
+            for(l = 0; l < 20; l+=2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
+              umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));                                 
+            }
+
+            umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
+            umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
+
+            _mm_storel_pd(&umpX1[80 * i + k], umpX1v);
+            _mm_storel_pd(&umpX2[80 * i + k], umpX2v);
+
+          }
+        }
+
+        for(i = 0; i < n; i++)
+        {
+          uX1 = &umpX1[80 * tipX1[i]];
+          uX2 = &umpX2[80 * tipX2[i]];
+
+          for(j = 0; j < 4; j++)
+          {
+            v = &x3[i * 80 + j * 20];
+
+
+            __m128d zero =  _mm_setzero_pd();
+            for(k = 0; k < 20; k+=2)                                
+              _mm_store_pd(&v[k], zero);
+
+            for(k = 0; k < 20; k++)
+            { 
+              double *eev = &extEV[k * 20];
+              x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+              __m128d x1px2v = _mm_set1_pd(x1px2);
+
+              for(l = 0; l < 20; l+=2)
+              {
+                __m128d vv = _mm_load_pd(&v[l]);
+                __m128d ee = _mm_load_pd(&eev[l]);
+
+                vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+
+                _mm_store_pd(&v[l], vv);
+              }
+            }
+
+
+          }        
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        double umpX1[1840], ump_x2[20];
+
+
+        for(i = 0; i < 23; i++)
+        {
+          v = &(tipVector[20 * i]);
+
+          for(k = 0; k < 80; k++)
+          {
+            double *ll =  &left[k * 20];
+
+            __m128d umpX1v = _mm_setzero_pd();
+
+            for(l = 0; l < 20; l+=2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));                                                 
+            }
+
+            umpX1v = _mm_hadd_pd(umpX1v, umpX1v);                               
+            _mm_storel_pd(&umpX1[80 * i + k], umpX1v);          
+
+
+          }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+          uX1 = &umpX1[80 * tipX1[i]];
+
+          for(k = 0; k < 4; k++)
+          {
+            v = &(x2[80 * i + k * 20]);
+
+            for(l = 0; l < 20; l++)
+            {              
+              double *r =  &right[k * 400 + l * 20];
+              __m128d ump_x2v = _mm_setzero_pd();           
+
+              for(j = 0; j < 20; j+= 2)
+              {
+                __m128d vv = _mm_load_pd(&v[j]);
+                __m128d rr = _mm_load_pd(&r[j]);
+                ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
+              }
+
+              ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+
+              _mm_storel_pd(&ump_x2[l], ump_x2v);                                    
+            }
+
+            v = &(x3[80 * i + 20 * k]);
+
+            __m128d zero =  _mm_setzero_pd();
+            for(l = 0; l < 20; l+=2)                                
+              _mm_store_pd(&v[l], zero);
+
+            for(l = 0; l < 20; l++)
+            {
+              double *eev = &extEV[l * 20];
+              x1px2 = uX1[k * 20 + l]  * ump_x2[l];
+              __m128d x1px2v = _mm_set1_pd(x1px2);
+
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d vv = _mm_load_pd(&v[j]);
+                __m128d ee = _mm_load_pd(&eev[j]);
+
+                vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+
+                _mm_store_pd(&v[j], vv);
+              }                             
+            }                   
+
+          }
+
+
+          { 
+            v = &(x3[80 * i]);
+            __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+            scale = 1;
+            for(l = 0; scale && (l < 80); l += 2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              __m128d v1 = _mm_and_pd(vv, absMask.m);
+              v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+              if(_mm_movemask_pd( v1 ) != 3)
+                scale = 0;
+            }             
+          }
+
+
+          if (scale)
+          {
+
+            __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+            for(l = 0; l < 80; l+=2)
+            {
+              __m128d ex3v = _mm_load_pd(&v[l]);                  
+              _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));      
+            }                             
+
+
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];
+
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+      {
+        for(k = 0; k < 4; k++)
+        {
+          vl = &(x1[80 * i + 20 * k]);
+          vr = &(x2[80 * i + 20 * k]);
+          v =  &(x3[80 * i + 20 * k]);
+
+
+          __m128d zero =  _mm_setzero_pd();
+          for(l = 0; l < 20; l+=2)                                  
+            _mm_store_pd(&v[l], zero);
+
+
+          for(l = 0; l < 20; l++)
+          {              
+
+            {
+              __m128d al = _mm_setzero_pd();
+              __m128d ar = _mm_setzero_pd();
+
+              double *ll   = &left[k * 400 + l * 20];
+              double *rr   = &right[k * 400 + l * 20];
+              double *EVEV = &extEV[20 * l];
+
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d lv  = _mm_load_pd(&ll[j]);
+                __m128d rv  = _mm_load_pd(&rr[j]);
+                __m128d vll = _mm_load_pd(&vl[j]);
+                __m128d vrr = _mm_load_pd(&vr[j]);
+
+                al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
+                ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
+              }                  
+
+              al = _mm_hadd_pd(al, al);
+              ar = _mm_hadd_pd(ar, ar);
+
+              al = _mm_mul_pd(al, ar);
+
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d vv  = _mm_load_pd(&v[j]);
+                __m128d EVV = _mm_load_pd(&EVEV[j]);
+
+                vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+
+                _mm_store_pd(&v[j], vv);
+              }                                           
+            }            
+
+          }
+        }
+
+
+
+        { 
+          v = &(x3[80 * i]);
+          __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+          scale = 1;
+          for(l = 0; scale && (l < 80); l += 2)
+          {
+            __m128d vv = _mm_load_pd(&v[l]);
+            __m128d v1 = _mm_and_pd(vv, absMask.m);
+            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(_mm_movemask_pd( v1 ) != 3)
+              scale = 0;
+          }               
+        }
+
+
+        if (scale)
+        {
+
+          __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+          for(l = 0; l < 80; l+=2)
+          {
+            __m128d ex3v = _mm_load_pd(&v[l]);            
+            _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));        
+          }                               
+
+
+          if(!fastScaling)
+            ex3[i] += 1;
+          else
+            addScale += wgt[i];
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
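+/* Editorial note: an illustrative scalar rendering, not part of the upstream
+   PLL patch, of the PLL_INNER_INNER recurrence that the SSE3 intrinsics in
+   newviewGTRGAMMAPROT above vectorise.  For every site, every GAMMA rate
+   category and every target amino-acid state, the two child contributions
+   are summed over the 20 states, multiplied, and pushed back through the
+   eigenvector matrix extEV; the numerical rescaling against
+   PLL_MINLIKELIHOOD is omitted here for brevity. */
+static void exampleNewviewGammaProtScalar(int n, double *x1, double *x2, double *x3,
+                                          double *left, double *right, double *extEV)
+{
+  int i, k, l, j;
+
+  for(i = 0; i < n; i++)                       /* alignment sites           */
+    for(k = 0; k < 4; k++)                     /* GAMMA rate categories     */
+    {
+      double *vl = &x1[80 * i + 20 * k];
+      double *vr = &x2[80 * i + 20 * k];
+      double *v  = &x3[80 * i + 20 * k];
+
+      for(j = 0; j < 20; j++)
+        v[j] = 0.0;
+
+      for(l = 0; l < 20; l++)                  /* target state l            */
+      {
+        double al = 0.0, ar = 0.0;
+
+        for(j = 0; j < 20; j++)                /* sum over child states     */
+        {
+          al += vl[j] * left[k * 400 + l * 20 + j];
+          ar += vr[j] * right[k * 400 + l * 20 + j];
+        }
+
+        for(j = 0; j < 20; j++)                /* back-transform with extEV */
+          v[j] += (al * ar) * extEV[l * 20 + j];
+      }
+    }
+}
+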
+
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR CAT (Optimized SSE3 version for AA data)
+
+    This is the SSE3 optimized version of ::newviewCAT_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b CAT
+    model of rate heterogeneity.
+
+    @note
+    For more details and function argument description check the function ::newviewCAT_FLEX
+*/
+static void newviewGTRCATPROT(int tipCase, double *extEV,
+                              int *cptr,
+                              double *x1, double *x2, double *x3, double *tipVector,
+                              int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                              int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
+{
+  double
+    *le, *ri, *v, *vl, *vr;
+
+  int i, l, j, scale, addScale = 0;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        for (i = 0; i < n; i++)
+        {
+          le = &left[cptr[i] * 400];
+          ri = &right[cptr[i] * 400];
+
+          vl = &(tipVector[20 * tipX1[i]]);
+          vr = &(tipVector[20 * tipX2[i]]);
+          v  = &x3[20 * i];
+
+          for(l = 0; l < 20; l+=2)
+            _mm_store_pd(&v[l], _mm_setzero_pd());                      
+
+
+          for(l = 0; l < 20; l++)
+          {
+            __m128d x1v = _mm_setzero_pd();
+            __m128d x2v = _mm_setzero_pd();      
+            double 
+              *ev = &extEV[l * 20],
+              *lv = &le[l * 20],
+              *rv = &ri[l * 20];
+
+            for(j = 0; j < 20; j+=2)
+            {
+              x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                  
+              x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+            }
+
+            x1v = _mm_hadd_pd(x1v, x1v);
+            x2v = _mm_hadd_pd(x2v, x2v);
+
+            x1v = _mm_mul_pd(x1v, x2v);
+
+            for(j = 0; j < 20; j+=2)
+            {
+              __m128d vv = _mm_load_pd(&v[j]);
+              vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+              _mm_store_pd(&v[j], vv);
+            }               
+
+          }        
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        for (i = 0; i < n; i++)
+        {
+          le = &left[cptr[i] * 400];
+          ri = &right[cptr[i] * 400];
+
+          vl = &(tipVector[20 * tipX1[i]]);
+          vr = &x2[20 * i];
+          v  = &x3[20 * i];
+
+          for(l = 0; l < 20; l+=2)
+            _mm_store_pd(&v[l], _mm_setzero_pd());                      
+
+
+
+          for(l = 0; l < 20; l++)
+          {
+
+            __m128d x1v = _mm_setzero_pd();
+            __m128d x2v = _mm_setzero_pd();     
+            double 
+              *ev = &extEV[l * 20],
+              *lv = &le[l * 20],
+              *rv = &ri[l * 20];
+
+            for(j = 0; j < 20; j+=2)
+            {
+              x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                  
+              x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+            }
+
+            x1v = _mm_hadd_pd(x1v, x1v);
+            x2v = _mm_hadd_pd(x2v, x2v);
+
+            x1v = _mm_mul_pd(x1v, x2v);
+
+            for(j = 0; j < 20; j+=2)
+            {
+              __m128d vv = _mm_load_pd(&v[j]);
+              vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+              _mm_store_pd(&v[j], vv);
+            }               
+
+          }
+
+          {         
+            __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+            scale = 1;
+            for(l = 0; scale && (l < 20); l += 2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              __m128d v1 = _mm_and_pd(vv, absMask.m);
+              v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+              if(_mm_movemask_pd( v1 ) != 3)
+                scale = 0;
+            }             
+          }
+
+
+          if(scale)
+          {
+
+            __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+            for(l = 0; l < 20; l+=2)
+            {
+              __m128d ex3v = _mm_load_pd(&v[l]);
+              _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));                  
+            }
+
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];         
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+      {
+        le = &left[cptr[i] * 400];
+        ri = &right[cptr[i] * 400];
+
+        vl = &x1[20 * i];
+        vr = &x2[20 * i];
+        v = &x3[20 * i];
+
+
+        for(l = 0; l < 20; l+=2)
+          _mm_store_pd(&v[l], _mm_setzero_pd());                        
+
+
+        for(l = 0; l < 20; l++)
+        {
+
+          __m128d x1v = _mm_setzero_pd();
+          __m128d x2v = _mm_setzero_pd();
+          double 
+            *ev = &extEV[l * 20],
+            *lv = &le[l * 20],
+            *rv = &ri[l * 20];
+
+
+          for(j = 0; j < 20; j+=2)
+          {
+            x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                    
+            x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+          }
+
+          x1v = _mm_hadd_pd(x1v, x1v);
+          x2v = _mm_hadd_pd(x2v, x2v);
+
+          x1v = _mm_mul_pd(x1v, x2v);
+
+          for(j = 0; j < 20; j+=2)
+          {
+            __m128d vv = _mm_load_pd(&v[j]);
+            vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+            _mm_store_pd(&v[j], vv);
+          }                 
+
+        }
+
+        {           
+          __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+          scale = 1;
+          for(l = 0; scale && (l < 20); l += 2)
+          {
+            __m128d vv = _mm_load_pd(&v[l]);
+            __m128d v1 = _mm_and_pd(vv, absMask.m);
+            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(_mm_movemask_pd( v1 ) != 3)
+              scale = 0;
+          }               
+        }
+
+
+        if(scale)
+        {
+
+          __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+          for(l = 0; l < 20; l+=2)
+          {
+            __m128d ex3v = _mm_load_pd(&v[l]);            
+            _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));        
+          }                               
+
+
+          if(!fastScaling)
+            ex3[i] += 1;
+          else
+            addScale += wgt[i];    
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
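+/* Editorial note: a small scalar sketch, illustrative only and not part of
+   the upstream PLL patch, of the numerical rescaling bookkeeping shared by
+   the kernels in this file.  When every entry of a site's vector has an
+   absolute value below PLL_MINLIKELIHOOD, the vector is multiplied by
+   PLL_TWOTOTHE256 and the event is recorded either per site (ex3[i], when
+   fastScaling is off) or as a site-weight total that is later returned
+   through scalerIncrement. */
+static void exampleScalingBookkeeping(double *v, int span, int i,
+                                      int *ex3, int *wgt, int *addScale,
+                                      const pllBoolean fastScaling)
+{
+  int l, scale = 1;
+
+  /* scale only if all entries are tiny: |v[l]| < PLL_MINLIKELIHOOD */
+  for(l = 0; scale && l < span; l++)
+    if(!(v[l] < PLL_MINLIKELIHOOD && v[l] > -PLL_MINLIKELIHOOD))
+      scale = 0;
+
+  if(scale)
+  {
+    for(l = 0; l < span; l++)
+      v[l] *= PLL_TWOTOTHE256;        /* lift the whole site vector back up */
+
+    if(!fastScaling)
+      ex3[i] += 1;                    /* per-site exponent counter          */
+    else
+      *addScale += wgt[i];            /* weighted count for scalerIncrement */
+  }
+}
+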
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR CAT with memory saving (Optimized SSE3 version for AA data)
+
+    This is the SSE3 optimized version of ::newviewCAT_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b CAT
+    model of rate heterogeneity. The memory saving technique is incorporated.
+
+    @note
+    For more details and function argument description check the function ::newviewCAT_FLEX
+*/
+static void newviewGTRCATPROT_SAVE(int tipCase, double *extEV,
+                                   int *cptr,
+                                   double *x1, double *x2, double *x3, double *tipVector,
+                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                   int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                   unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                   double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
+{
+  double
+    *le, 
+    *ri, 
+    *v, 
+    *vl, 
+    *vr,
+    *x1_ptr = x1,
+    *x2_ptr = x2, 
+    *x3_ptr = x3;
+
+  int 
+    i, 
+    l, 
+    j, 
+    scale, 
+    scaleGap = 0,
+    addScale = 0;
+
+  {
+    vl = x1_gapColumn;        
+    vr = x2_gapColumn;
+    v = x3_gapColumn;
+
+    le = &left[maxCats * 400];
+    ri = &right[maxCats * 400];   
+
+    for(l = 0; l < 20; l+=2)
+      _mm_store_pd(&v[l], _mm_setzero_pd());                    
+
+    for(l = 0; l < 20; l++)
+    {
+      __m128d x1v = _mm_setzero_pd();
+      __m128d x2v = _mm_setzero_pd();
+      double 
+        *ev = &extEV[l * 20],
+        *lv = &le[l * 20],
+        *rv = &ri[l * 20];
+
+
+      for(j = 0; j < 20; j+=2)
+      {
+        x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                
+        x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+      }
+
+      x1v = _mm_hadd_pd(x1v, x1v);
+      x2v = _mm_hadd_pd(x2v, x2v);
+
+      x1v = _mm_mul_pd(x1v, x2v);
+
+      for(j = 0; j < 20; j+=2)
+      {
+        __m128d vv = _mm_load_pd(&v[j]);
+        vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+        _mm_store_pd(&v[j], vv);
+      }                 
+    }
+
+    if(tipCase != PLL_TIP_TIP)
+    {       
+      __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+      scale = 1;
+      for(l = 0; scale && (l < 20); l += 2)
+      {
+        __m128d vv = _mm_load_pd(&v[l]);
+        __m128d v1 = _mm_and_pd(vv, absMask.m);
+        v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+        if(_mm_movemask_pd( v1 ) != 3)
+          scale = 0;
+      }                 
+
+      if(scale)
+      {
+        __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+        for(l = 0; l < 20; l+=2)
+        {
+          __m128d ex3v = _mm_load_pd(&v[l]);              
+          _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));  
+        }                                 
+
+        scaleGap = PLL_TRUE;       
+      }
+    }
+  }
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        for (i = 0; i < n; i++)
+        {
+          if(noGap(x3_gap, i))
+          {             
+            vl = &(tipVector[20 * tipX1[i]]);
+            vr = &(tipVector[20 * tipX2[i]]);
+            v  = x3_ptr;
+
+            if(isGap(x1_gap, i))
+              le =  &left[maxCats * 400];
+            else                  
+              le =  &left[cptr[i] * 400];         
+
+            if(isGap(x2_gap, i))
+              ri =  &right[maxCats * 400];
+            else                  
+              ri =  &right[cptr[i] * 400];
+
+            for(l = 0; l < 20; l+=2)
+              _mm_store_pd(&v[l], _mm_setzero_pd());                    
+
+            for(l = 0; l < 20; l++)
+            {
+              __m128d x1v = _mm_setzero_pd();
+              __m128d x2v = _mm_setzero_pd();    
+              double 
+                *ev = &extEV[l * 20],
+                *lv = &le[l * 20],
+                *rv = &ri[l * 20];
+
+              for(j = 0; j < 20; j+=2)
+              {
+                x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                
+                x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+              }
+
+              x1v = _mm_hadd_pd(x1v, x1v);
+              x2v = _mm_hadd_pd(x2v, x2v);
+
+              x1v = _mm_mul_pd(x1v, x2v);
+
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d vv = _mm_load_pd(&v[j]);
+                vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+                _mm_store_pd(&v[j], vv);
+              }            
+            }
+
+            x3_ptr += 20;
+
+          }   
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        for (i = 0; i < n; i++)
+        {
+          if(isGap(x3_gap, i))
+          {
+            if(scaleGap)
+              {
+                if(!fastScaling)
+                  ex3[i] += 1;
+                else
+                  addScale += wgt[i];
+              }
+          }
+          else
+          {      
+            vl = &(tipVector[20 * tipX1[i]]);
+
+            vr = x2_ptr;
+            v = x3_ptr;
+
+            if(isGap(x1_gap, i))
+              le =  &left[maxCats * 400];
+            else
+              le =  &left[cptr[i] * 400];
+
+            if(isGap(x2_gap, i))
+            {            
+              ri =  &right[maxCats * 400];
+              vr = x2_gapColumn;
+            }
+            else
+            {
+              ri =  &right[cptr[i] * 400];
+              vr = x2_ptr;
+              x2_ptr += 20;
+            }                                             
+
+            for(l = 0; l < 20; l+=2)
+              _mm_store_pd(&v[l], _mm_setzero_pd());                               
+
+            for(l = 0; l < 20; l++)
+            {
+              __m128d x1v = _mm_setzero_pd();
+              __m128d x2v = _mm_setzero_pd();   
+              double 
+                *ev = &extEV[l * 20],
+                *lv = &le[l * 20],
+                *rv = &ri[l * 20];
+
+              for(j = 0; j < 20; j+=2)
+              {
+                x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                
+                x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+              }
+
+              x1v = _mm_hadd_pd(x1v, x1v);
+              x2v = _mm_hadd_pd(x2v, x2v);
+
+              x1v = _mm_mul_pd(x1v, x2v);
+
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d vv = _mm_load_pd(&v[j]);
+                vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+                _mm_store_pd(&v[j], vv);
+              }             
+            }
+
+            {       
+              __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+              scale = 1;
+              for(l = 0; scale && (l < 20); l += 2)
+              {
+                __m128d vv = _mm_load_pd(&v[l]);
+                __m128d v1 = _mm_and_pd(vv, absMask.m);
+                v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+                if(_mm_movemask_pd( v1 ) != 3)
+                  scale = 0;
+              }           
+            }
+
+
+            if(scale)
+            {
+              __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+              for(l = 0; l < 20; l+=2)
+              {
+                __m128d ex3v = _mm_load_pd(&v[l]);
+                _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));                
+              }
+              
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];       
+            }
+            x3_ptr += 20;
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+      { 
+        if(isGap(x3_gap, i))
+        {
+          if(scaleGap)
+            {
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];
+            }
+        }
+        else
+        {                    
+          v = x3_ptr;
+
+          if(isGap(x1_gap, i))
+          {
+            vl = x1_gapColumn;
+            le =  &left[maxCats * 400];
+          }
+          else
+          {
+            le =  &left[cptr[i] * 400];
+            vl = x1_ptr;
+            x1_ptr += 20;
+          }
+
+          if(isGap(x2_gap, i))  
+          {
+            vr = x2_gapColumn;
+            ri =  &right[maxCats * 400];            
+          }
+          else
+          {
+            ri =  &right[cptr[i] * 400];
+            vr = x2_ptr;
+            x2_ptr += 20;
+          }                               
+
+          for(l = 0; l < 20; l+=2)
+            _mm_store_pd(&v[l], _mm_setzero_pd());                      
+
+          for(l = 0; l < 20; l++)
+          {
+            __m128d x1v = _mm_setzero_pd();
+            __m128d x2v = _mm_setzero_pd();
+            double 
+              *ev = &extEV[l * 20],
+              *lv = &le[l * 20],
+              *rv = &ri[l * 20];
+
+            for(j = 0; j < 20; j+=2)
+            {
+              x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                  
+              x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+            }
+
+            x1v = _mm_hadd_pd(x1v, x1v);
+            x2v = _mm_hadd_pd(x2v, x2v);
+
+            x1v = _mm_mul_pd(x1v, x2v);
+
+            for(j = 0; j < 20; j+=2)
+            {
+              __m128d vv = _mm_load_pd(&v[j]);
+              vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+              _mm_store_pd(&v[j], vv);
+            }               
+
+          }
+
+          {         
+            __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+            scale = 1;
+            for(l = 0; scale && (l < 20); l += 2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              __m128d v1 = _mm_and_pd(vv, absMask.m);
+              v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+              if(_mm_movemask_pd( v1 ) != 3)
+                scale = 0;
+            }             
+          }
+
+          if(scale)
+          {
+            __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+            for(l = 0; l < 20; l+=2)
+            {
+              __m128d ex3v = _mm_load_pd(&v[l]);                  
+              _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));      
+            }                             
+
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];          
+          }
+          x3_ptr += 20;
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
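+
+/* In the kernel above, each 20-element dot product is computed two doubles at a
+   time with _mm_mul_pd/_mm_add_pd and then collapsed with _mm_hadd_pd, so that
+   x1v and x2v each hold a complete dot product in both lanes; their product is
+   the combined contribution of the two child vectors for state l, which is then
+   scattered into v[] through the eigenvector rows of extEV. The scaling block
+   multiplies v[] by PLL_TWOTOTHE256 only when every entry has an absolute value
+   below PLL_MINLIKELIHOOD, recording the event either in ex3[] or in the
+   weighted counter addScale, depending on fastScaling. */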
+
+
+/** @ingroup group1
+ *  @brief Computation of the conditional likelihood array for the GTR GAMMA and the LG4 model (Optimized SSE3 version for AA data)
+
+    This is the SSE3 optimized version of ::newviewGAMMA_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b GAMMA
+    model of rate heterogeneity and the LG4 model of evolution. Note that the original unoptimized
+    function does not incorporate the LG4 model.
+
+    @note
+    For more details and function argument description check the function ::newviewGAMMA_FLEX
+*/
+static void newviewGTRGAMMAPROT_LG4(int tipCase,
+                                    double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
+                                    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                    int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling)
+{
+  double  *uX1, *uX2, *v;
+  double x1px2;
+  int  i, j, l, k, scale, addScale = 0;
+  double *vl, *vr;
+#ifndef __SSE3
+  double al, ar;
+#endif
+
+
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+        double umpX1[1840], umpX2[1840];
+
+        for(i = 0; i < 23; i++)
+          {
+           
+
+            for(k = 0; k < 80; k++)
+              {
+                
+                v = &(tipVector[k / 20][20 * i]);
+#ifdef __SSE3
+                double *ll =  &left[k * 20];
+                double *rr =  &right[k * 20];
+                
+                __m128d umpX1v = _mm_setzero_pd();
+                __m128d umpX2v = _mm_setzero_pd();
+
+                for(l = 0; l < 20; l+=2)
+                  {
+                    __m128d vv = _mm_load_pd(&v[l]);
+                    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
+                    umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));                                   
+                  }
+                
+                umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
+                umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
+                
+                _mm_storel_pd(&umpX1[80 * i + k], umpX1v);
+                _mm_storel_pd(&umpX2[80 * i + k], umpX2v);
+#else
+                umpX1[80 * i + k] = 0.0;
+                umpX2[80 * i + k] = 0.0;
+
+                for(l = 0; l < 20; l++)
+                  {
+                    umpX1[80 * i + k] +=  v[l] *  left[k * 20 + l];
+                    umpX2[80 * i + k] +=  v[l] * right[k * 20 + l];
+                  }
+#endif
+              }
+          }
+
+        for(i = 0; i < n; i++)
+          {
+            uX1 = &umpX1[80 * tipX1[i]];
+            uX2 = &umpX2[80 * tipX2[i]];
+
+            for(j = 0; j < 4; j++)
+              {
+                v = &x3[i * 80 + j * 20];
+
+#ifdef __SSE3
+                __m128d zero =  _mm_setzero_pd();
+                for(k = 0; k < 20; k+=2)                                    
+                  _mm_store_pd(&v[k], zero);
+
+                for(k = 0; k < 20; k++)
+                  { 
+                    double *eev = &extEV[j][k * 20];
+                    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+                    __m128d x1px2v = _mm_set1_pd(x1px2);
+
+                    for(l = 0; l < 20; l+=2)
+                      {
+                        __m128d vv = _mm_load_pd(&v[l]);
+                        __m128d ee = _mm_load_pd(&eev[l]);
+
+                        vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+                        
+                        _mm_store_pd(&v[l], vv);
+                      }
+                  }
+
+#else
+
+                for(k = 0; k < 20; k++)
+                  v[k] = 0.0;
+
+                for(k = 0; k < 20; k++)
+                  {                
+                    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+                   
+                    for(l = 0; l < 20; l++)                                                     
+                      v[l] += x1px2 * extEV[j][20 * k + l];                  
+                  }
+#endif
+              }    
+          }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        double umpX1[1840], ump_x2[20];
+
+
+        for(i = 0; i < 23; i++)
+          {
+           
+
+            for(k = 0; k < 80; k++)
+              { 
+                v = &(tipVector[k / 20][20 * i]);
+#ifdef __SSE3
+                double *ll =  &left[k * 20];
+                                
+                __m128d umpX1v = _mm_setzero_pd();
+                
+                for(l = 0; l < 20; l+=2)
+                  {
+                    __m128d vv = _mm_load_pd(&v[l]);
+                    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));                                                   
+                  }
+                
+                umpX1v = _mm_hadd_pd(umpX1v, umpX1v);                           
+                _mm_storel_pd(&umpX1[80 * i + k], umpX1v);              
+#else       
+                umpX1[80 * i + k] = 0.0;
+
+                for(l = 0; l < 20; l++)
+                  umpX1[80 * i + k] +=  v[l] * left[k * 20 + l];
+#endif
+
+              }
+          }
+
+        for (i = 0; i < n; i++)
+          {
+            uX1 = &umpX1[80 * tipX1[i]];
+
+            for(k = 0; k < 4; k++)
+              {
+                v = &(x2[80 * i + k * 20]);
+#ifdef __SSE3              
+                for(l = 0; l < 20; l++)
+                  {                
+                    double *r =  &right[k * 400 + l * 20];
+                    __m128d ump_x2v = _mm_setzero_pd();     
+                    
+                    for(j = 0; j < 20; j+= 2)
+                      {
+                        __m128d vv = _mm_load_pd(&v[j]);
+                        __m128d rr = _mm_load_pd(&r[j]);
+                        ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
+                      }
+                     
+                    ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+                    
+                    _mm_storel_pd(&ump_x2[l], ump_x2v);                              
+                  }
+
+                v = &(x3[80 * i + 20 * k]);
+
+                __m128d zero =  _mm_setzero_pd();
+                for(l = 0; l < 20; l+=2)                                    
+                  _mm_store_pd(&v[l], zero);
+                  
+                for(l = 0; l < 20; l++)
+                  {
+                    double *eev = &extEV[k][l * 20];
+                    x1px2 = uX1[k * 20 + l]  * ump_x2[l];
+                    __m128d x1px2v = _mm_set1_pd(x1px2);
+                  
+                    for(j = 0; j < 20; j+=2)
+                      {
+                        __m128d vv = _mm_load_pd(&v[j]);
+                        __m128d ee = _mm_load_pd(&eev[j]);
+                        
+                        vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+                        
+                        _mm_store_pd(&v[j], vv);
+                      }                             
+                  }                     
+#else
+                for(l = 0; l < 20; l++)
+                  {
+                    ump_x2[l] = 0.0;
+
+                    for(j = 0; j < 20; j++)
+                      ump_x2[l] += v[j] * right[k * 400 + l * 20 + j];
+                  }
+
+                v = &(x3[80 * i + 20 * k]);
+
+                for(l = 0; l < 20; l++)
+                  v[l] = 0;
+
+                for(l = 0; l < 20; l++)
+                  {
+                    x1px2 = uX1[k * 20 + l]  * ump_x2[l];
+                    for(j = 0; j < 20; j++)
+                      v[j] += x1px2 * extEV[k][l * 20  + j];
+                  }
+#endif
+              }
+           
+#ifdef __SSE3
+            { 
+              v = &(x3[80 * i]);
+              __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+              
+              scale = 1;
+              for(l = 0; scale && (l < 80); l += 2)
+                {
+                  __m128d vv = _mm_load_pd(&v[l]);
+                  __m128d v1 = _mm_and_pd(vv, absMask.m);
+                  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+                  if(_mm_movemask_pd( v1 ) != 3)
+                    scale = 0;
+                }                 
+            }
+#else
+            v = &x3[80 * i];
+            scale = 1;
+            for(l = 0; scale && (l < 80); l++)
+              scale = (PLL_ABS(v[l]) <  PLL_MINLIKELIHOOD );
+#endif
+
+            if (scale)
+              {
+#ifdef __SSE3
+               __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+               
+               for(l = 0; l < 80; l+=2)
+                 {
+                   __m128d ex3v = _mm_load_pd(&v[l]);             
+                   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); 
+                 }                                
+#else
+                for(l = 0; l < 80; l++)
+                  v[l] *= PLL_TWOTOTHE256;
+#endif
+
+                if(useFastScaling)
+                  addScale += wgt[i];
+                else
+                  ex3[i]  += 1;        
+              }
+          }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+       {
+         for(k = 0; k < 4; k++)
+           {
+             vl = &(x1[80 * i + 20 * k]);
+             vr = &(x2[80 * i + 20 * k]);
+             v =  &(x3[80 * i + 20 * k]);
+
+#ifdef __SSE3
+             __m128d zero =  _mm_setzero_pd();
+             for(l = 0; l < 20; l+=2)                               
+               _mm_store_pd(&v[l], zero);
+#else
+             for(l = 0; l < 20; l++)
+               v[l] = 0;
+#endif
+
+             for(l = 0; l < 20; l++)
+               {                 
+#ifdef __SSE3
+                 {
+                   __m128d al = _mm_setzero_pd();
+                   __m128d ar = _mm_setzero_pd();
+
+                   double *ll   = &left[k * 400 + l * 20];
+                   double *rr   = &right[k * 400 + l * 20];
+                   double *EVEV = &extEV[k][20 * l];
+                   
+                   for(j = 0; j < 20; j+=2)
+                     {
+                       __m128d lv  = _mm_load_pd(&ll[j]);
+                       __m128d rv  = _mm_load_pd(&rr[j]);
+                       __m128d vll = _mm_load_pd(&vl[j]);
+                       __m128d vrr = _mm_load_pd(&vr[j]);
+                       
+                       al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
+                       ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
+                     }                   
+                       
+                   al = _mm_hadd_pd(al, al);
+                   ar = _mm_hadd_pd(ar, ar);
+                   
+                   al = _mm_mul_pd(al, ar);
+
+                   for(j = 0; j < 20; j+=2)
+                     {
+                       __m128d vv  = _mm_load_pd(&v[j]);
+                       __m128d EVV = _mm_load_pd(&EVEV[j]);
+
+                       vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+
+                       _mm_store_pd(&v[j], vv);
+                     }                                            
+                 }               
+#else
+                 al = 0.0;
+                 ar = 0.0;
+
+                 for(j = 0; j < 20; j++)
+                   {
+                     al += vl[j] * left[k * 400 + l * 20 + j];
+                     ar += vr[j] * right[k * 400 + l * 20 + j];
+                   }
+
+                 x1px2 = al * ar;
+
+                 for(j = 0; j < 20; j++)
+                   v[j] += x1px2 * extEV[k][20 * l + j];
+#endif
+               }
+           }
+         
+
+#ifdef __SSE3
+         { 
+           v = &(x3[80 * i]);
+           __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+           
+           scale = 1;
+           for(l = 0; scale && (l < 80); l += 2)
+             {
+               __m128d vv = _mm_load_pd(&v[l]);
+               __m128d v1 = _mm_and_pd(vv, absMask.m);
+               v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+               if(_mm_movemask_pd( v1 ) != 3)
+                 scale = 0;
+             }            
+         }
+#else
+         v = &(x3[80 * i]);
+         scale = 1;
+         for(l = 0; scale && (l < 80); l++)
+           scale = ((PLL_ABS(v[l]) <  PLL_MINLIKELIHOOD ));
+#endif
+
+         if (scale)
+           {
+#ifdef __SSE3
+               __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+               
+               for(l = 0; l < 80; l+=2)
+                 {
+                   __m128d ex3v = _mm_load_pd(&v[l]);             
+                   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); 
+                 }                                
+#else        
+             for(l = 0; l < 80; l++)
+               v[l] *= PLL_TWOTOTHE256;
+#endif
+
+             if(useFastScaling)
+               addScale += wgt[i];
+             else
+               ex3[i]  += 1;      
+           }
+       }
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+
+}
+#endif
+
+
diff --git a/pll/optimizeModel.c b/pll/optimizeModel.c
new file mode 100644
index 0000000..b4e8902
--- /dev/null
+++ b/pll/optimizeModel.c
@@ -0,0 +1,3149 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file optimizeModel.c
+ *
+ * @brief Model optimization routines
+ */ 
+
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+static const double MNBRAK_GOLD =    1.618034;          /**< Golden ratio */
+static const double MNBRAK_TINY =      1.e-20;
+static const double MNBRAK_GLIMIT =     100.0;
+static const double BRENT_ZEPS  =       1.e-5;
+static const double BRENT_CGOLD =   0.3819660;
+
+extern int optimizeRatesInvocations;
+extern int optimizeAlphaInvocations;
+extern int optimizeInvarInvocations;
+extern char ratesFileName[1024];
+extern char lengthFileName[1024];
+extern char lengthFileNameModel[1024];
+extern char *protModels[PLL_NUM_PROT_MODELS];
+
+static void optParamGeneric(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll, int numberOfModels, int rateNumber, double lim_inf, double lim_sup, int whichParameterType);
+// FLAG for easier debugging of model parameter optimization routines 
+
+//#define _DEBUG_MOD_OPT
+
+
+/*********************FUNCTIONS FOR EXACT MODEL OPTIMIZATION UNDER GTRGAMMA ***************************************/
+
+
+/* The following function is used to set rates in the Q matrix.
+   The data structure called symmetryVector is used to
+   define the symmetries between rates as they are specified
+   in some of the secondary-structure substitution models, which
+   generally do not use full GTR matrices but more restricted forms thereof. */
+
+/** @brief Set a specific rate in the substitution matrix
+  *
+  * This function is used to set the \a position-th substitution rate of
+  * partition \a index to \a rate.
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param model
+  *   Index of partition
+  *
+  * @param rate
+  *   The new value to which to set the specific substitution rate
+  *
+  * @param position
+  *   The index of the substitution rate
+  */
+static void setRateModel(partitionList *pr, int model, double rate, int position)
+{
+  int
+    states   = pr->partitionData[model]->states,
+    numRates = (states * states - states) / 2;
+
+  if(pr->partitionData[model]->dataType == PLL_DNA_DATA)
+    assert(position >= 0 && position < (numRates - 1));
+  else
+    assert(position >= 0 && position < numRates);
+
+  assert(pr->partitionData[model]->dataType != PLL_BINARY_DATA);
+
+  assert(rate >= PLL_RATE_MIN && rate <= PLL_RATE_MAX);
+
+  if(pr->partitionData[model]->nonGTR)
+    {    
+      int 
+        i, 
+        index    = pr->partitionData[model]->symmetryVector[position],
+        lastRate = pr->partitionData[model]->symmetryVector[numRates - 1];
+           
+      for(i = 0; i < numRates; i++)
+        {       
+          if(pr->partitionData[model]->symmetryVector[i] == index)
+            {
+              if(index == lastRate)
+                pr->partitionData[model]->substRates[i] = 1.0;
+              else
+                pr->partitionData[model]->substRates[i] = rate;      
+            }
+          
+          //printf("%f ", tr->partitionData[model].substRates[i]);
+        }
+      //printf("\n");
+    }
+  else
+    pr->partitionData[model]->substRates[position] = rate;
+}
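+
+/* Illustrative example with a hypothetical symmetryVector (the values below are
+   invented purely for illustration and are not taken from any actual model):
+   suppose the symmetryVector of a nonGTR partition starts with {0, 1, 0, 0, ...}.
+   A call such as
+
+       setRateModel(pr, model, 2.5, 0);
+
+   then sets substRates[0] = substRates[2] = substRates[3] = 2.5, because those
+   positions all share symmetry class 0. Positions whose class equals the class
+   of the very last rate are instead pinned to 1.0 whenever their class is
+   selected, so the last rate acts as the fixed reference rate of the matrix. */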
+
+//LIBRARY: the only thing that we will need to do here is to 
+//replace linkList by a string and also add some error correction 
+//code
+
+/* 
+   The following three functions are used to link/unlink parameters 
+   between partitions. This should work in a generic way; however, 
+   so far it is mainly used for linking/unlinking GTR matrix parameter 
+   estimates across different protein data partitions.
+   In general, this mechanism can also be used for linking/unlinking alpha parameters 
+   between partitions and the like.
+   However, all alpha parameter estimates for all partitions and GTR estimates for 
+   DNA partitions are unlinked by default. This is actually hard-coded 
+   in here. 
+*/
+
+/* Initialize a parameter linkage list for a certain parameter type (can be whatever).
+   The input is an integer vector that contains NumberOfModels (numberOfPartitions) elements.
+
+   If we want all alpha parameters unlinked and have, say, 4 partitions, the input 
+   vector would look like this: {0, 1, 2, 3}; if we want to link partitions 0 and 3, the vector 
+   should look like this: {0, 1, 2, 0}.
+*/
+
+
+
+
+
+
+/* Dedicated helper function to initialize the linkage list, that is, essentially compute 
+   the integer vector int *linkList used above for linking GTR models.
+   
+   Once again, this is hard-coded in RAxML, because users cannot influence the linking.
+
+*/
+   
+
+/* free linkage list data structure */
+
+#define ALPHA_F    0
+#define RATE_F     1
+#define FREQ_F     2
+#define LXRATE_F   3
+#define LXWEIGHT_F 4
+
+static void updateWeights(partitionList *pr, int model, int rate, double value)
+{
+    int j;
+    double w = 0.0;
+    assert(rate >= 0 && rate < 4);
+    pr->partitionData[model]->lg4x_weightExponents[rate] = value;
+    for (j = 0; j < 4; j++)
+        w += exp(pr->partitionData[model]->lg4x_weightExponents[j]);
+    for (j = 0; j < 4; j++)
+        pr->partitionData[model]->lg4x_weights[j] = exp(
+                pr->partitionData[model]->lg4x_weightExponents[j]) / w;
+}
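+
+/* In other words: the four LG4X weights are stored as unconstrained exponents
+   e_0..e_3 and projected onto the simplex via a softmax,
+
+       w_j = exp(e_j) / (exp(e_0) + exp(e_1) + exp(e_2) + exp(e_3)),
+
+   so each w_j stays strictly positive and the four weights always sum to 1,
+   no matter what value the optimizer proposes for any single exponent. */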
+
+static void optimizeWeights(pllInstance *tr, partitionList *pr, double modelEpsilon, linkageList *ll,
+        int numberOfModels)
+{
+    int i;
+    double initialLH = 0.0, finalLH = 0.0;
+    pllEvaluateLikelihood(tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+    initialLH = tr->likelihood;
+    for (i = 0; i < 4; i++)
+        optParamGeneric(tr, pr, modelEpsilon, ll, numberOfModels, i, -1000000.0,
+                200.0, LXWEIGHT_F);
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+    pllMasterBarrier(tr, pr, PLL_THREAD_COPY_LG4X_RATES);
+#endif
+
+    pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+    finalLH = tr->likelihood;
+    if (finalLH < initialLH)
+        printf("Final: %f initial: %f\n", finalLH, initialLH);
+    assert(finalLH >= initialLH);
+}
+
+/** @brief Wrapper function for changing a specific model parameter to the specified value
+  *
+  * Change the \a rateNumber-th model parameter of the type specified by \a whichParameterType to
+  * the value \a value.
+  * This routine is usually called by model optimization routines, either to restore the original
+  * model parameter value when optimization has led to a worse likelihood than the original, or
+  * to apply a new candidate parameter value that is being tested.
+  * In case of changing a frequency or substitution rate, the Q matrix is also decomposed again
+  * (into eigenvalues and eigenvectors).
+  *
+  * @param index
+  *   Index of partition
+  *
+  * @param rateNumber
+  *   The index of the model parameter
+  *
+  * @param value
+  *   The value to which the parameter must be changed
+  *
+  * @param whichParameterType
+  *   Type of model parameter. Can be \b RATE_F, \b ALPHA_F, \b FREQ_F, \b LXRATE_F or \b LXWEIGHT_F,
+  *   that is substitution rates, alpha parameters, base frequencies, LG4X rates, or LG4X weights
+  */   
+static void changeModelParameters(int index, int rateNumber, double value, int whichParameterType, pllInstance *tr, partitionList * pr)
+{
+  switch(whichParameterType)
+    {
+    case RATE_F:
+      setRateModel(pr, index, value, rateNumber);  
+      pllInitReversibleGTR(tr, pr, index);          
+      break;
+    case ALPHA_F:
+      pr->partitionData[index]->alpha = value;
+      pllMakeGammaCats(pr->partitionData[index]->alpha, pr->partitionData[index]->gammaRates, 4, tr->useMedian);
+      break;
+    case FREQ_F:
+      {
+        int 
+          states = pr->partitionData[index]->states,
+          j;
+
+        double 
+          w = 0.0;
+
+        pr->partitionData[index]->freqExponents[rateNumber] = value;
+
+        for(j = 0; j < states; j++)
+          w += exp(pr->partitionData[index]->freqExponents[j]);
+
+        for(j = 0; j < states; j++)              
+          pr->partitionData[index]->frequencies[j] = exp(pr->partitionData[index]->freqExponents[j]) / w;
+        
+        pllInitReversibleGTR(tr, pr, index);
+      }
+      break;
+    case LXRATE_F:
+        pr->partitionData[index]->gammaRates[rateNumber] = value;
+        break;
+    case LXWEIGHT_F:
+        updateWeights(pr, index, rateNumber, value);
+        break;
+    default:
+      assert(0);
+    }
+}
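+
+/* Illustrative usage sketch (partitionIndex and previousAlpha are hypothetical
+   placeholders, not identifiers defined in this file): a typical "restore" step
+   as described above would look like
+
+       changeModelParameters(partitionIndex, 0, previousAlpha, ALPHA_F, tr, pr);
+
+   For ALPHA_F the rateNumber argument is not used by the switch above, and the
+   four gamma category rates are recomputed from the restored alpha value via
+   pllMakeGammaCats(). */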
+
+/* function that evaluates the change to a parameter */
+/** @brief Evaluate the change of a parameter
+ *
+ *  Evaluate the likelihood for each entry \a i in the linkage list when changing the
+ *  \a rateNumber-th parameter of type \a whichFunction (\b ALPHA_F, \b RATE_F 
+ *  or \b FREQ_F) to \a value[i]. The resulting likelihood for each entry \a i in the
+ *  linkage list is then stored in \a result[i]
+ *
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param rateNumber
+ *    Index of the parameter to optimize 
+ *
+ *  @param value
+ *
+ *  @param result
+ *    An array where the total likelihood of each entry \a i in the linkage list \a ll is stored when evaluating the new \a i-th parameter value from array \a value
+ *
+ *  @param converged
+ *
+ *  @param whichFunction
+ *    Type of the model parameter. Possible values are \b ALPHA_F, \b RATE_F and \b FREQ_F
+ *
+ *  @param numberOfModels
+ *    Number of partitions for which we are optimizing 
+ *
+ *  @param ll
+ *    Linkage list
+ *
+ *  @param modelEpsilon
+ *    Epsilon threshold
+ */
+static void evaluateChange(pllInstance *tr, partitionList *pr, int rateNumber, double *value, double *result, pllBoolean* converged, int whichFunction, int numberOfModels, linkageList *ll, double modelEpsilon)
+{ 
+  int 
+    i, 
+    k, 
+    pos;
+
+  pllBoolean
+    atLeastOnePartition = PLL_FALSE;
+
+  for(i = 0, pos = 0; i < ll->entries; i++)
+    {
+      if(ll->ld[i].valid)
+        {
+          if(converged[pos])
+            {
+              for(k = 0; k < ll->ld[i].partitions; k++)
+                pr->partitionData[ll->ld[i].partitionList[k]]->executeModel = PLL_FALSE;
+            }
+          else
+            {
+              atLeastOnePartition = PLL_TRUE;
+              for(k = 0; k < ll->ld[i].partitions; k++)
+                {
+                  int 
+                    index = ll->ld[i].partitionList[k];
+
+
+                  changeModelParameters(index, rateNumber, value[pos], whichFunction, tr, pr);
+                }
+            }
+          pos++;
+        }
+      else
+        {
+          for(k = 0; k < ll->ld[i].partitions; k++)
+            pr->partitionData[ll->ld[i].partitionList[k]]->executeModel = PLL_FALSE;
+        }      
+    }
+
+  assert(pos == numberOfModels);
+
+    //some error checks for individual model parameters
+    switch (whichFunction)
+    {
+    case RATE_F:
+        assert(rateNumber != -1);
+        break;
+    case ALPHA_F:
+        break;
+    case LXRATE_F:
+        assert(rateNumber != -1);
+        break;
+    case LXWEIGHT_F:
+        assert(rateNumber != -1);
+        break;
+    case FREQ_F:
+        break;
+    default:
+        assert(0);
+    }
+
+    switch (whichFunction)
+    {
+    case RATE_F:
+    case ALPHA_F:
+    case LXRATE_F:
+    case FREQ_F:
+        pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+        break;
+    case LXWEIGHT_F:
+        pllEvaluateLikelihood(tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+        break;
+    default:
+        assert(0);
+    }
+    //nested optimization for the LG4X model, now optimize the weights!
+    if (whichFunction == LXRATE_F && atLeastOnePartition)
+    {
+        pllBoolean *buffer = (pllBoolean*) malloc(
+                pr->numberOfPartitions* sizeof(pllBoolean));
+
+        for (i = 0; i < pr->numberOfPartitions; i++) {
+            buffer[i] = pr->partitionData[i]->executeModel;
+            pr->partitionData[i]->executeModel = PLL_FALSE;
+        }
+
+        for (i = 0, pos = 0; i < ll->entries; i++)
+        {
+            int index = ll->ld[i].partitionList[0];
+            if (ll->ld[i].valid)
+                pr->partitionData[index]->executeModel = PLL_TRUE;
+        }
+        optimizeWeights(tr, pr, modelEpsilon, ll, numberOfModels);
+
+        for (i = 0; i < pr->numberOfPartitions; i++) {
+            pr->partitionData[i]->executeModel = buffer[i];
+        }
+
+        free(buffer);
+    }
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+
+   switch (whichFunction)
+    {
+      case RATE_F:
+        pllMasterBarrier(tr, pr, PLL_THREAD_OPT_RATE);
+        break;
+      case ALPHA_F:
+        pllMasterBarrier(tr, pr, PLL_THREAD_OPT_ALPHA);
+        break;
+      case FREQ_F:
+        pllMasterBarrier(tr, pr, PLL_THREAD_OPT_RATE);
+        break;
+      case LXRATE_F:
+        pllMasterBarrier(tr, pr, PLL_THREAD_OPT_LG4X_RATE);
+        break;
+      case LXWEIGHT_F:
+        pllMasterBarrier(tr, pr, PLL_THREAD_OPT_LG4X_RATE);
+        break;
+      default:
+        break;
+    }
+#else
+   //commented out evaluate below in the course of the LG4X integration
+   //pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+#endif     
+
+
+  for(i = 0, pos = 0; i < ll->entries; i++)     
+    {
+      if(ll->ld[i].valid)
+        {
+          result[pos] = 0.0;
+          
+          for(k = 0; k < ll->ld[i].partitions; k++)
+            {
+              int 
+                index = ll->ld[i].partitionList[k];
+
+              assert(pr->partitionData[index]->partitionLH <= 0.0);
+              result[pos] -= pr->partitionData[index]->partitionLH;
+              
+            }
+          pos++;
+        }
+      for(k = 0; k < ll->ld[i].partitions; k++)
+        {
+          int index = ll->ld[i].partitionList[k];
+          pr->partitionData[index]->executeModel = PLL_TRUE;
+        }         
+    }
+  
+  assert(pos == numberOfModels);   
+}
+
+/* generic implementation of Brent's algorithm for one-dimensional parameter optimization */
+
+/** @brief Brent's algorithm
+ *
+ *  Generic implementation of Brent's algorithm for one-dimensional parameter optimization
+ *
+ *  @param ax
+ *    One endpoint of each bracketing interval (one entry per linked parameter group)
+ *
+ *  @param bx
+ *    Initial parameter estimates, lying inside the brackets
+ *
+ *  @param cx
+ *    The other endpoint of each bracketing interval
+ *
+ *  @param fb
+ *    Function values (negated log likelihoods) at \a bx
+ *
+ *  @param tol
+ *    Relative tolerance used in the convergence test
+ *
+ *  @param xmin
+ *    Output: the parameter values that minimize the target function
+ *
+ *  @param result
+ *    Output: the negated minimum function values, i.e. the log likelihoods at \a xmin
+ *
+ *  @param numberOfModels
+ *    Number of partitions for which we are optimizing 
+ *
+ *  @param whichFunction
+ *    Type of the model parameter. Possible values are \b ALPHA_F, \b RATE_F and \b FREQ_F
+ *
+ *  @param rateNumber
+ *     Index of the parameter to optimize 
+ *   
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param ll
+ *    Linkage list
+ *
+ *  @param lim_inf
+ *    Lower bound for the rate assignment
+ *
+ *  @param lim_sup
+ *    Upper bound for the rate assignment
+ *
+ *  @todo
+ *     Why not preallocate all memory once instead of allocating at every call? That would save
+ *     the time currently lost to repeated allocation calls and to the allocator searching for
+ *     free blocks, and would also prevent memory fragmentation.
+ */
+static void brentGeneric(double *ax, double *bx, double *cx, double *fb, double tol, double *xmin, double *result, int numberOfModels, 
+                         int whichFunction, int rateNumber, pllInstance *tr, partitionList *pr, linkageList *ll, double lim_inf, double lim_sup)
+{
+  int iter, i;
+  double 
+    *a     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *b     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *d     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *etemp = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *fu    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *fv    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *fw    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *fx    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *p     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *q     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *r     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *tol1  = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *tol2  = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *u     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *v     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *w     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *x     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *xm    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *e     = (double *)rax_malloc(sizeof(double) * numberOfModels);
+  pllBoolean *converged = (pllBoolean *)rax_malloc(sizeof(pllBoolean) * numberOfModels);
+  pllBoolean allConverged;
+  
+  for(i = 0; i < numberOfModels; i++)    
+    converged[i] = PLL_FALSE;
+
+  for(i = 0; i < numberOfModels; i++)
+    {
+      e[i] = 0.0;
+      d[i] = 0.0;
+    }
+
+  for(i = 0; i < numberOfModels; i++)
+    {
+      a[i]=((ax[i] < cx[i]) ? ax[i] : cx[i]);
+      b[i]=((ax[i] > cx[i]) ? ax[i] : cx[i]);
+      x[i] = w[i] = v[i] = bx[i];
+      fw[i] = fv[i] = fx[i] = fb[i];
+    }
+
+  for(i = 0; i < numberOfModels; i++)
+    {      
+      assert(a[i] >= lim_inf && a[i] <= lim_sup);
+      assert(b[i] >= lim_inf && b[i] <= lim_sup);
+      assert(x[i] >= lim_inf && x[i] <= lim_sup);
+      assert(v[i] >= lim_inf && v[i] <= lim_sup);
+      assert(w[i] >= lim_inf && w[i] <= lim_sup);
+    }
+  
+  
+
+  for(iter = 1; iter <= PLL_ITMAX; iter++)
+    {
+      allConverged = PLL_TRUE;
+
+      for(i = 0; i < numberOfModels && allConverged; i++)
+        allConverged = allConverged && converged[i];
+
+      if(allConverged)
+        {
+          rax_free(converged);
+          rax_free(a);
+          rax_free(b);
+          rax_free(d);
+          rax_free(etemp);
+          rax_free(fu);
+          rax_free(fv);
+          rax_free(fw);
+          rax_free(fx);
+          rax_free(p);
+          rax_free(q);
+          rax_free(r);
+          rax_free(tol1);
+          rax_free(tol2);
+          rax_free(u);
+          rax_free(v);
+          rax_free(w);
+          rax_free(x);
+          rax_free(xm);
+          rax_free(e);
+          return;
+        }     
+
+      for(i = 0; i < numberOfModels; i++)
+        {
+          if(!converged[i])
+            {                 
+              assert(a[i] >= lim_inf && a[i] <= lim_sup);
+              assert(b[i] >= lim_inf && b[i] <= lim_sup);
+              assert(x[i] >= lim_inf && x[i] <= lim_sup);
+              assert(v[i] >= lim_inf && v[i] <= lim_sup);
+              assert(w[i] >= lim_inf && w[i] <= lim_sup);
+  
+              xm[i] = 0.5 * (a[i] + b[i]);
+              tol2[i] = 2.0 * (tol1[i] = tol * fabs(x[i]) + BRENT_ZEPS);
+          
+              if(fabs(x[i] - xm[i]) <= (tol2[i] - 0.5 * (b[i] - a[i])))
+                {                
+                  result[i] =  -fx[i];
+                  xmin[i]   = x[i];
+                  converged[i] = PLL_TRUE;                
+                }
+              else
+                {
+                  if(fabs(e[i]) > tol1[i])
+                    {                
+                      r[i] = (x[i] - w[i]) * (fx[i] - fv[i]);
+                      q[i] = (x[i] - v[i]) * (fx[i] - fw[i]);
+                      p[i] = (x[i] - v[i]) * q[i] - (x[i] - w[i]) * r[i];
+                      q[i] = 2.0 * (q[i] - r[i]);
+                      if(q[i] > 0.0)
+                        p[i] = -p[i];
+                      q[i] = fabs(q[i]);
+                      etemp[i] = e[i];
+                      e[i] = d[i];
+                      if((fabs(p[i]) >= fabs(0.5 * q[i] * etemp[i])) || (p[i] <= q[i] * (a[i]-x[i])) || (p[i] >= q[i] * (b[i] - x[i])))
+                        d[i] = BRENT_CGOLD * (e[i] = (x[i] >= xm[i] ? a[i] - x[i] : b[i] - x[i]));
+                      else
+                        {
+                          d[i] = p[i] / q[i];
+                          u[i] = x[i] + d[i];
+                          if( u[i] - a[i] < tol2[i] || b[i] - u[i] < tol2[i])
+                            d[i] = PLL_SIGN(tol1[i], xm[i] - x[i]);
+                        }
+                    }
+                  else
+                    {                
+                      d[i] = BRENT_CGOLD * (e[i] = (x[i] >= xm[i] ? a[i] - x[i]: b[i] - x[i]));
+                    }
+                  u[i] = ((fabs(d[i]) >= tol1[i]) ? (x[i] + d[i]) : (x[i] + PLL_SIGN(tol1[i], d[i])));
+                }
+
+              if(!converged[i])
+                assert(u[i] >= lim_inf && u[i] <= lim_sup);
+            }
+        }
+                 
+      evaluateChange(tr, pr, rateNumber, u, fu, converged, whichFunction, numberOfModels, ll, tol);
+
+      for(i = 0; i < numberOfModels; i++)
+        {
+          if(!converged[i])
+            {
+              if(fu[i] <= fx[i])
+                {
+                  if(u[i] >= x[i])
+                    a[i] = x[i];
+                  else
+                    b[i] = x[i];
+                  
+                  PLL_SHFT(v[i],w[i],x[i],u[i]);
+                  PLL_SHFT(fv[i],fw[i],fx[i],fu[i]);
+                }
+              else
+                {
+                  if(u[i] < x[i])
+                    a[i] = u[i];
+                  else
+                    b[i] = u[i];
+                  
+                  if(fu[i] <= fw[i] || w[i] == x[i])
+                    {
+                      v[i] = w[i];
+                      w[i] = u[i];
+                      fv[i] = fw[i];
+                      fw[i] = fu[i];
+                    }
+                  else
+                    {
+                      if(fu[i] <= fv[i] || v[i] == x[i] || v[i] == w[i])
+                        {
+                          v[i] = u[i];
+                          fv[i] = fu[i];
+                        }
+                    }       
+                }
+              
+              assert(a[i] >= lim_inf && a[i] <= lim_sup);
+              assert(b[i] >= lim_inf && b[i] <= lim_sup);
+              assert(x[i] >= lim_inf && x[i] <= lim_sup);
+              assert(v[i] >= lim_inf && v[i] <= lim_sup);
+              assert(w[i] >= lim_inf && w[i] <= lim_sup);
+              assert(u[i] >= lim_inf && u[i] <= lim_sup);
+            }
+        }
+    }
+
+  rax_free(converged);
+  rax_free(a);
+  rax_free(b);
+  rax_free(d);
+  rax_free(etemp);
+  rax_free(fu);
+  rax_free(fv);
+  rax_free(fw);
+  rax_free(fx);
+  rax_free(p);
+  rax_free(q);
+  rax_free(r);
+  rax_free(tol1);
+  rax_free(tol2);
+  rax_free(u);
+  rax_free(v);
+  rax_free(w);
+  rax_free(x);
+  rax_free(xm);
+  rax_free(e);
+
+  printf("\n. Too many iterations in BRENT !");
+  assert(0);
+}
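+
+/* Restating the stopping rule used above: with xm_i = (a_i + b_i) / 2,
+   tol1_i = tol * |x_i| + BRENT_ZEPS and tol2_i = 2 * tol1_i, the convergence test
+
+       |x_i - xm_i| <= tol2_i - (b_i - a_i) / 2
+
+   is equivalent to requiring that the bracket endpoint farther from the current
+   best point x_i lies within 2 * tol1_i of it, i.e. the whole remaining
+   interval [a_i, b_i] already sits inside the requested tolerance around x_i. */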
+
+/* generic bracketing function required for Brent's algorithm. For details please see the corresponding chapter in the book Numerical Recipes in C */
+
+/** @brief Bracketing function
+ *
+ *  Generic bracketing function required for Brent's algorithm.
+ *  
+ *  @param param
+ *    Scratch array holding the parameter values probed during bracketing (one entry per linked parameter group)
+ *
+ *  @param ax
+ *    In/out: first points of the bracketing intervals
+ *
+ *  @param bx
+ *    In/out: second points of the bracketing intervals
+ *
+ *  @param cx
+ *    Out: third points of the bracketing intervals, obtained by golden-ratio extrapolation
+ *
+ *  @param fa
+ *    Out: function values (negated log likelihoods) at \a ax
+ *
+ *  @param fb
+ *    Out: function values at \a bx
+ *
+ *  @param fc
+ *    Out: function values at \a cx
+ *
+ *  @param lim_inf
+ *    Lower bound for the rate assignment
+ *
+ *  @param lim_sup
+ *    Upper bound for the rate assignment
+ *
+ *  @param numberOfModels
+ *    Number of partitions for which we are optimizing 
+ *
+ *  @param rateNumber
+ *     Index of the parameter to optimize 
+ *
+ *  @param whichFunction
+ *    Type of the model parameter. Possible values are \b ALPHA_F, \b RATE_F and \b FREQ_F
+ *
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param ll
+ *    Linkage list
+ *
+ *  @param modelEpsilon
+ *
+ *  @return
+ *    Always returns 0
+ *
+ *  @todo
+ *    Fill remaining details
+ */
+static int brakGeneric(double *param, double *ax, double *bx, double *cx, double *fa, double *fb, 
+                       double *fc, double lim_inf, double lim_sup, 
+                       int numberOfModels, int rateNumber, int whichFunction, pllInstance *tr, partitionList *pr,
+                       linkageList *ll, double modelEpsilon)
+{
+  double 
+    *ulim = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *u    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *r    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *q    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *fu   = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *dum  = (double *)rax_malloc(sizeof(double) * numberOfModels), 
+    *temp = (double *)rax_malloc(sizeof(double) * numberOfModels);
+  
+  int 
+    i,
+    *state    = (int *)rax_malloc(sizeof(int) * numberOfModels),
+    *endState = (int *)rax_malloc(sizeof(int) * numberOfModels);
+
+  pllBoolean *converged = (pllBoolean *)rax_malloc(sizeof(pllBoolean) * numberOfModels);
+  pllBoolean allConverged;
+
+  for(i = 0; i < numberOfModels; i++)
+    converged[i] = PLL_FALSE;
+
+  for(i = 0; i < numberOfModels; i++)
+    {
+      state[i] = 0;
+      endState[i] = 0;
+
+      u[i] = 0.0;
+
+      param[i] = ax[i];
+
+      if(param[i] > lim_sup)    
+        param[i] = ax[i] = lim_sup;
+      
+      if(param[i] < lim_inf) 
+        param[i] = ax[i] = lim_inf;
+
+      assert(param[i] >= lim_inf && param[i] <= lim_sup);
+    }
+   
+  
+  evaluateChange(tr, pr, rateNumber, param, fa, converged, whichFunction, numberOfModels, ll, modelEpsilon);
+
+
+  for(i = 0; i < numberOfModels; i++)
+    {
+      param[i] = bx[i];
+      if(param[i] > lim_sup) 
+        param[i] = bx[i] = lim_sup;
+      if(param[i] < lim_inf) 
+        param[i] = bx[i] = lim_inf;
+
+      assert(param[i] >= lim_inf && param[i] <= lim_sup);
+    }
+  
+  evaluateChange(tr, pr, rateNumber, param, fb, converged, whichFunction, numberOfModels, ll, modelEpsilon);
+
+  for(i = 0; i < numberOfModels; i++)  
+    {
+      if (fb[i] > fa[i]) 
+        {         
+          PLL_SHFT(dum[i],ax[i],bx[i],dum[i]);
+          PLL_SHFT(dum[i],fa[i],fb[i],dum[i]);
+        }
+      
+      cx[i] = bx[i] + MNBRAK_GOLD * (bx[i] - ax[i]);
+      
+      param[i] = cx[i];
+      
+      if(param[i] > lim_sup) 
+        param[i] = cx[i] = lim_sup;
+      if(param[i] < lim_inf) 
+        param[i] = cx[i] = lim_inf;
+
+      assert(param[i] >= lim_inf && param[i] <= lim_sup);
+    }
+  
+ 
+  evaluateChange(tr, pr, rateNumber, param, fc, converged, whichFunction, numberOfModels,  ll, modelEpsilon);
+
+   while(1) 
+     {       
+       allConverged = PLL_TRUE;
+
+       for(i = 0; i < numberOfModels && allConverged; i++)
+         allConverged = allConverged && converged[i];
+
+       if(allConverged)
+         {
+           for(i = 0; i < numberOfModels; i++)
+             {         
+               if(ax[i] > lim_sup) 
+                 ax[i] = lim_sup;
+               if(ax[i] < lim_inf) 
+                 ax[i] = lim_inf;
+
+               if(bx[i] > lim_sup) 
+                 bx[i] = lim_sup;
+               if(bx[i] < lim_inf) 
+                 bx[i] = lim_inf;
+               
+               if(cx[i] > lim_sup) 
+                 cx[i] = lim_sup;
+               if(cx[i] < lim_inf) 
+                 cx[i] = lim_inf;
+             }
+
+           rax_free(converged);
+           rax_free(ulim);
+           rax_free(u);
+           rax_free(r);
+           rax_free(q);
+           rax_free(fu);
+           rax_free(dum); 
+           rax_free(temp);
+           rax_free(state);   
+           rax_free(endState);
+           return 0;
+           
+         }
+
+       for(i = 0; i < numberOfModels; i++)
+         {
+           if(!converged[i])
+             {
+               switch(state[i])
+                 {
+                 case 0:
+                   endState[i] = 0;
+                   if(!(fb[i] > fc[i]))                  
+                     converged[i] = PLL_TRUE;                                
+                   else
+                     {
+                   
+                       if(ax[i] > lim_sup) 
+                         ax[i] = lim_sup;
+                       if(ax[i] < lim_inf) 
+                         ax[i] = lim_inf;
+                       if(bx[i] > lim_sup) 
+                         bx[i] = lim_sup;
+                       if(bx[i] < lim_inf) 
+                         bx[i] = lim_inf;
+                       if(cx[i] > lim_sup) 
+                         cx[i] = lim_sup;
+                       if(cx[i] < lim_inf) 
+                         cx[i] = lim_inf;
+                       
+                       r[i]=(bx[i]-ax[i])*(fb[i]-fc[i]);
+                       q[i]=(bx[i]-cx[i])*(fb[i]-fa[i]);
+                       u[i]=(bx[i])-((bx[i]-cx[i])*q[i]-(bx[i]-ax[i])*r[i])/
+                         (2.0 * PLL_SIGN(PLL_MAX(fabs(q[i]-r[i]),MNBRAK_TINY),q[i]-r[i]));
+                       
+                       ulim[i]=(bx[i])+MNBRAK_GLIMIT*(cx[i]-bx[i]);
+                       
+                       if(u[i] > lim_sup) 
+                         u[i] = lim_sup;
+                       if(u[i] < lim_inf) 
+                         u[i] = lim_inf;
+                       if(ulim[i] > lim_sup) 
+                         ulim[i] = lim_sup;
+                       if(ulim[i] < lim_inf) 
+                         ulim[i] = lim_inf;
+                       
+                       if ((bx[i]-u[i])*(u[i]-cx[i]) > 0.0)
+                         {
+                           param[i] = u[i];
+                           if(param[i] > lim_sup)                            
+                             param[i] = u[i] = lim_sup;
+                           if(param[i] < lim_inf)
+                             param[i] = u[i] = lim_inf;
+                           endState[i] = 1;
+                         }
+                       else 
+                         {
+                           if ((cx[i]-u[i])*(u[i]-ulim[i]) > 0.0) 
+                             {
+                               param[i] = u[i];
+                               if(param[i] > lim_sup) 
+                                 param[i] = u[i] = lim_sup;
+                               if(param[i] < lim_inf) 
+                                 param[i] = u[i] = lim_inf;
+                               endState[i] = 2;
+                             }                         
+                           else
+                             {
+                               if ((u[i]-ulim[i])*(ulim[i]-cx[i]) >= 0.0) 
+                                 {
+                                   u[i] = ulim[i];
+                                   param[i] = u[i];     
+                                   if(param[i] > lim_sup) 
+                                     param[i] = u[i] = ulim[i] = lim_sup;
+                                   if(param[i] < lim_inf) 
+                                     param[i] = u[i] = ulim[i] = lim_inf;
+                                   endState[i] = 0;
+                                 }                              
+                               else 
+                                 {                
+                                   u[i]=(cx[i])+MNBRAK_GOLD*(cx[i]-bx[i]);
+                                   param[i] = u[i];
+                                   endState[i] = 0;
+                                   if(param[i] > lim_sup) 
+                                     param[i] = u[i] = lim_sup;
+                                   if(param[i] < lim_inf) 
+                                     param[i] = u[i] = lim_inf;
+                                 }
+                             }    
+                         }
+                     }
+                   break;
+                 case 1:
+                   endState[i] = 0;
+                   break;
+                 case 2:
+                   endState[i] = 3;
+                   break;
+                 default:
+                   assert(0);
+                 }
+               assert(param[i] >= lim_inf && param[i] <= lim_sup);
+             }
+         }
+             
+       evaluateChange(tr, pr, rateNumber, param, temp, converged, whichFunction, numberOfModels, ll, modelEpsilon);
+
+       for(i = 0; i < numberOfModels; i++)
+         {
+           if(!converged[i])
+             {         
+               switch(endState[i])
+                 {
+                 case 0:
+                   fu[i] = temp[i];
+                   PLL_SHFT(ax[i],bx[i],cx[i],u[i]);
+                   PLL_SHFT(fa[i],fb[i],fc[i],fu[i]);
+                   state[i] = 0;
+                   break;
+                 case 1:
+                   fu[i] = temp[i];
+                   if (fu[i] < fc[i]) 
+                     {
+                       ax[i]=(bx[i]);
+                       bx[i]=u[i];
+                       fa[i]=(fb[i]);
+                       fb[i]=fu[i]; 
+                       converged[i] = PLL_TRUE;               
+                     } 
+                   else 
+                     {
+                       if (fu[i] > fb[i]) 
+                         {
+                           assert(u[i] >= lim_inf && u[i] <= lim_sup);
+                           cx[i]=u[i];
+                           fc[i]=fu[i];
+                           converged[i] = PLL_TRUE;                       
+                         }
+                       else
+                         {                 
+                           u[i]=(cx[i])+MNBRAK_GOLD*(cx[i]-bx[i]);
+                           param[i] = u[i];
+                           if(param[i] > lim_sup) {param[i] = u[i] = lim_sup;}
+                           if(param[i] < lim_inf) {param[i] = u[i] = lim_inf;}    
+                           state[i] = 1;                 
+                         }                
+                     }
+                   break;
+                 case 2: 
+                   fu[i] = temp[i];
+                   if (fu[i] < fc[i]) 
+                     {               
+                       PLL_SHFT(bx[i],cx[i],u[i], cx[i]+MNBRAK_GOLD*(cx[i]-bx[i]));
+                       state[i] = 2;
+                     }     
+                   else
+                     {
+                       state[i] = 0;
+                       PLL_SHFT(ax[i],bx[i],cx[i],u[i]);
+                       PLL_SHFT(fa[i],fb[i],fc[i],fu[i]);
+                     }
+                   break;          
+                 case 3:                  
+                   PLL_SHFT(fb[i],fc[i],fu[i], temp[i]);
+                   PLL_SHFT(ax[i],bx[i],cx[i],u[i]);
+                   PLL_SHFT(fa[i],fb[i],fc[i],fu[i]);
+                   state[i] = 0;
+                   break;
+                 default:
+                   assert(0);
+                 }
+             }
+         }
+    }
+   
+
+   assert(0);
+   rax_free(converged);
+   rax_free(ulim);
+   rax_free(u);
+   rax_free(r);
+   rax_free(q);
+   rax_free(fu);
+   rax_free(dum); 
+   rax_free(temp);
+   rax_free(state);   
+   rax_free(endState);
+
+  
+
+   return(0);
+}
+
+/*******************************************************************************************************/
+/******** LG4X ***************************************************************************************/
+
+void pllOptLG4X(pllInstance *tr, partitionList * pr, double modelEpsilon,
+        linkageList *ll, int numberOfModels)
+{
+    int i;
+    double lg4xScaler, *lg4xScalers = (double *) calloc(pr->numberOfPartitions,
+            sizeof(double)), wgtsum = 0.0;
+    for (i = 0; i < 4; i++)
+        optParamGeneric(tr, pr, modelEpsilon, ll, numberOfModels, i, PLL_LG4X_RATE_MIN,
+                PLL_LG4X_RATE_MAX, LXRATE_F);
+    for (i = 0; i < pr->numberOfPartitions; i++)
+        lg4xScalers[i] = 1.0;
+    for (i = 0; i < ll->entries; i++)
+    {
+        if (ll->ld[i].valid)
+        {
+            int j, index = ll->ld[i].partitionList[0];
+            double averageRate = 0.0;
+            assert(ll->ld[i].partitions == 1);
+            for (j = 0; j < 4; j++)
+                averageRate += pr->partitionData[index]->gammaRates[j];
+            averageRate /= 4.0;
+            lg4xScalers[index] = averageRate;
+        }
+    }
+    if (pr->numberOfPartitions > 1)
+    {
+        for (i = 0; i < pr->numberOfPartitions; i++)
+            pr->partitionData[i]->fracchange = pr->partitionData[i]->rawFracchange * (1.0 / lg4xScalers[i]);
+    }
+    for (i = 0; i < pr->numberOfPartitions; i++)
+        wgtsum += (double) pr->partitionData[i]->partitionWeight;
+    lg4xScaler = 0.0;
+    for (i = 0; i < pr->numberOfPartitions; i++)
+    {
+        double fraction = (double) pr->partitionData[i]->partitionWeight / wgtsum;
+        lg4xScaler += (fraction * lg4xScalers[i]);
+    }
+    tr->fracchange = tr->rawFracchange * (1.0 / lg4xScaler);
+    free(lg4xScalers);
+}
+
+/**********************************************************************************************************/
+/* ALPHA PARAM ********************************************************************************************/
+
+
+//this function is required for implementing the LG4X model later on
+
+/** @brief Optimize alpha rates
+  *
+  * Generic routine for alpha rates optimization
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param modelEpsilon
+  *   Optimization tolerance passed down to \a optParamGeneric
+  *
+  * @param ll
+  *   Linkage list
+  *
+  * @todo
+  *   Implement the LG4X model
+  */
+void pllOptAlphasGeneric(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll)
+{
+  int 
+    i,
+    non_LG4X_Partitions = 0,
+    LG4X_Partitions  = 0;
+
+  /* assumes homogeneous super-partitions, that either contain DNA or AA partitions !*/
+  /* does not check whether AA are all linked */
+
+  /* first do non-LG4X partitions */
+
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+        {
+        case PLL_DNA_DATA:                          
+        case PLL_BINARY_DATA:
+        case PLL_SECONDARY_DATA:
+        case PLL_SECONDARY_DATA_6:
+        case PLL_SECONDARY_DATA_7:
+        case PLL_GENERIC_32:
+        case PLL_GENERIC_64:
+            if (pr->partitionData[ll->ld[i].partitionList[0]]->optimizeAlphaParameter)
+            {
+                ll->ld[i].valid = PLL_TRUE;
+                non_LG4X_Partitions++;
+            }
+            else
+                ll->ld[i].valid = PLL_FALSE;
+            break;
+        case PLL_AA_DATA:
+            if (pr->partitionData[ll->ld[i].partitionList[0]]->optimizeAlphaParameter)
+            {
+                if (pr->partitionData[ll->ld[i].partitionList[0]]->protModels == PLL_LG4X)
+                {
+                    LG4X_Partitions++;
+                    ll->ld[i].valid = PLL_FALSE;
+                }
+                else
+                {
+                    ll->ld[i].valid = PLL_TRUE;
+                    non_LG4X_Partitions++;
+                }
+            }
+            else
+                ll->ld[i].valid = PLL_FALSE;
+            break;
+        default:
+            assert(0);
+        }      
+    }   
+
+ 
+
+  if(non_LG4X_Partitions > 0)    
+    optParamGeneric(tr, pr, modelEpsilon, ll, non_LG4X_Partitions, -1, PLL_ALPHA_MIN, PLL_ALPHA_MAX, ALPHA_F);
+  
+  /* then LG4x partitions */
+
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+        {
+        case PLL_DNA_DATA:                          
+        case PLL_BINARY_DATA:
+        case PLL_SECONDARY_DATA:
+        case PLL_SECONDARY_DATA_6:
+        case PLL_SECONDARY_DATA_7:
+        case PLL_GENERIC_32:
+        case PLL_GENERIC_64:
+          ll->ld[i].valid = PLL_FALSE;    
+          break;
+        case PLL_AA_DATA:     
+          if(pr->partitionData[ll->ld[i].partitionList[0]]->protModels == PLL_LG4X)
+            ll->ld[i].valid = PLL_TRUE;
+          else
+            ll->ld[i].valid = PLL_FALSE;                    
+          break;
+        default:
+          assert(0);
+        }      
+    }   
+  
+  if(LG4X_Partitions > 0)
+    pllOptLG4X(tr, pr, modelEpsilon, ll, LG4X_Partitions);
+
+  for(i = 0; ll && i < ll->entries; i++)
+    ll->ld[i].valid = PLL_TRUE;
+}
+
+/** @brief Optimize model parameters
+  *
+  * Function for optimizing the \a rateNumber-th model parameter of type \a whichParameterType,
+  * i.e. alpha rate, substitution rate, or base frequency rate, in all partitions with the \a
+  * valid flag set to \b PLL_TRUE.
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *   
+  * @param modelEpsilon
+  *    Tolerance passed to the bracketing / Brent routines (\a brakGeneric / \a brentGeneric)
+  *
+  * @param ll
+  *   Linkage list
+  * 
+  * @param numberOfModels
+  *   Number of partitions for which we are optimizing 
+  *
+  * @param rateNumber
+  *  Index of the parameter to optimize 
+  *
+  * @param lim_inf
+  *  Lower bound for the rate assignment
+  *
+  * @param lim_sup
+  *  Upper bound for the rate assignment
+  *
+  * @param whichParameterType
+  *  Type of the model parameter. Possible values are \b ALPHA_F, \b RATE_F and \b FREQ_F
+  *
+  * @todo
+  *    Describe the modelEpsilon parameter in detail
+  */
+static void optParamGeneric(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll, int numberOfModels, int rateNumber, double lim_inf, double lim_sup, int whichParameterType)
+{
+  int
+    l,
+    k, 
+    j, 
+    pos;
+
+  double
+    *startRates     = (double *)rax_malloc(sizeof(double) * numberOfModels * 4),
+    *startWeights   = (double *)rax_malloc(sizeof(double) * numberOfModels * 4),
+    *startExponents = (double *)rax_malloc(sizeof(double) * numberOfModels * 4),
+    *startValues = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *startLH     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *endLH       = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_a          = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_b          = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_c          = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_fa         = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_fb         = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_fc         = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_param      = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_x          = (double *)rax_malloc(sizeof(double) * numberOfModels);
+   
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+    if (whichParameterType == LXWEIGHT_F)
+        pllEvaluateLikelihood (tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+    else
+    {
+        pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+        if (whichParameterType == LXRATE_F)
+        {
+            int j;
+            for (j = 0; j < pr->numberOfPartitions; j++)
+                pr->partitionData[j]->lg4x_weightLikelihood = pr->partitionData[j]->partitionLH;
+        }
+    }
+  
+#ifdef  _DEBUG_MOD_OPT
+  double
+    initialLH = tr->likelihood;
+#endif
+
+  /* 
+     at this point here every worker has the traversal data it needs for the 
+     search 
+  */
+
+  /* store in startValues the values of the old parameters */
+  for(l = 0, pos = 0; ll && l < ll->entries; l++)
+    {
+      if(ll->ld[l].valid)
+        {
+          endLH[pos] = PLL_UNLIKELY;
+          startLH[pos] = 0.0;
+
+          for(j = 0; j < ll->ld[l].partitions; j++)
+            {
+              int 
+                index = ll->ld[l].partitionList[j];
+              
+              startLH[pos] += pr->partitionData[index]->partitionLH;
+              
+              switch(whichParameterType)
+                {
+                case ALPHA_F:
+                  startValues[pos] = pr->partitionData[index]->alpha;
+                  break;
+                case RATE_F:
+                  startValues[pos] = pr->partitionData[index]->substRates[rateNumber];      
+                  break;
+                case FREQ_F:
+                  startValues[pos] = pr->partitionData[index]->freqExponents[rateNumber];
+                  break;
+                case LXRATE_F:
+                    assert(rateNumber >= 0 && rateNumber < 4);
+                    startValues[pos] =
+                            pr->partitionData[index]->gammaRates[rateNumber];
+                    memcpy(&startRates[pos * 4],
+                            pr->partitionData[index]->gammaRates,
+                            4 * sizeof(double));
+                    memcpy(&startExponents[pos * 4],
+                            pr->partitionData[index]->lg4x_weightExponents,
+                            4 * sizeof(double));
+                    memcpy(&startWeights[pos * 4],
+                            pr->partitionData[index]->lg4x_weights,
+                            4 * sizeof(double));
+                    break;
+                case LXWEIGHT_F:
+                    assert(rateNumber >= 0 && rateNumber < 4);
+                    startValues[pos] =
+                            pr->partitionData[index]->lg4x_weightExponents[rateNumber];
+                    break;
+                default:
+                  assert(0);
+                }
+            }
+          pos++;
+        }
+    }  
+
+  assert(pos == numberOfModels);
+   
+  for(k = 0, pos = 0; ll && k < ll->entries; k++)
+    {
+      if(ll->ld[k].valid)
+        {
+          _a[pos] = startValues[pos] + 0.1;
+          _b[pos] = startValues[pos] - 0.1;
+
+          if(_a[pos] < lim_inf) 
+            _a[pos] = lim_inf;
+          
+          if(_a[pos] > lim_sup) 
+            _a[pos] = lim_sup;
+              
+          if(_b[pos] < lim_inf) 
+            _b[pos] = lim_inf;
+          
+          if(_b[pos] > lim_sup) 
+            _b[pos] = lim_sup;    
+
+          pos++;
+        }
+    }                                
+
+  assert(pos == numberOfModels);
+
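+  /* first bracket the optimum of the parameter for all numberOfModels partitions
+     at once (brakGeneric), then refine it within that bracket using Brent's
+     method (brentGeneric) */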
+  brakGeneric(_param, _a, _b, _c, _fa, _fb, _fc, lim_inf, lim_sup, numberOfModels, rateNumber, whichParameterType, tr, pr, ll, modelEpsilon);
+      
+  for(k = 0; k < numberOfModels; k++)
+    {
+      assert(_a[k] >= lim_inf && _a[k] <= lim_sup);
+      assert(_b[k] >= lim_inf && _b[k] <= lim_sup);       
+      assert(_c[k] >= lim_inf && _c[k] <= lim_sup);         
+    }      
+
+  brentGeneric(_a, _b, _c, _fb, modelEpsilon, _x, endLH, numberOfModels, whichParameterType, rateNumber, tr,  pr, ll, lim_inf, lim_sup);
+        
+  for(k = 0, pos = 0; ll && k < ll->entries; k++)
+    {
+      if(ll->ld[k].valid)
+        { 
+          if(startLH[pos] > endLH[pos])
+            {
+              //if the initial likelihood was better than the likelihood after optimization, we set the values back 
+              //to their original values 
+
+              for(j = 0; j < ll->ld[k].partitions; j++)
+                {
+                  int 
+                    index = ll->ld[k].partitionList[j];
+                  
+                  if (whichParameterType == LXRATE_F)
+                    {
+                        memcpy(pr->partitionData[index]->lg4x_weights,
+                                &startWeights[pos * 4], sizeof(double) * 4);
+                        memcpy(pr->partitionData[index]->gammaRates,
+                                &startRates[pos * 4], sizeof(double) * 4);
+                        memcpy(pr->partitionData[index]->lg4x_weightExponents,
+                                &startExponents[pos * 4], 4 * sizeof(double));
+                    }
+
+                    changeModelParameters(index, rateNumber, startValues[pos], whichParameterType, tr, pr); 
+                }
+            }
+          else
+            {
+              //otherwise we set the parameter to the optimized value 
+              //this used to be a bug in standard RAxML before I fixed it: 
+              //_x[pos] was not being used as the value that needs to be set 
+
+              for(j = 0; j < ll->ld[k].partitions; j++)
+                {
+                  int 
+                    index = ll->ld[k].partitionList[j];
+                  
+                  changeModelParameters(index, rateNumber, _x[pos], whichParameterType, tr, pr);
+
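+                  /* LG4X bookkeeping: when optimizing the weights (LXWEIGHT_F) an
+                     improved weight set is stashed in the *Buffer arrays; when
+                     optimizing the rates (LXRATE_F) the best buffered weights are
+                     copied back into place */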
+                  if (whichParameterType == LXWEIGHT_F)
+                    {
+                        if (endLH[pos]
+                                > pr->partitionData[index]->lg4x_weightLikelihood)
+                        {
+                            memcpy(pr->partitionData[index]->lg4x_weightsBuffer,
+                                    pr->partitionData[index]->lg4x_weights,
+                                    sizeof(double) * 4);
+                            memcpy(
+                                    pr->partitionData[index]->lg4x_weightExponentsBuffer,
+                                    pr->partitionData[index]->lg4x_weightExponents,
+                                    sizeof(double) * 4);
+                            pr->partitionData[index]->lg4x_weightLikelihood =
+                                    endLH[pos];
+                        }
+                    }
+                    if (whichParameterType == LXRATE_F)
+                    {
+                        memcpy(pr->partitionData[index]->lg4x_weights,
+                                pr->partitionData[index]->lg4x_weightsBuffer,
+                                sizeof(double) * 4);
+                        memcpy(pr->partitionData[index]->lg4x_weightExponents,
+                                pr->partitionData[index]->lg4x_weightExponentsBuffer,
+                                sizeof(double) * 4);
+                    }
+                }
+            }
+          pos++;
+        }
+    }
+
+  #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+      if (whichParameterType == LXRATE_F || whichParameterType == LXWEIGHT_F) {
+        pllMasterBarrier(tr, pr, PLL_THREAD_COPY_LG4X_RATES);
+      } else {
+        pllMasterBarrier(tr, pr, PLL_THREAD_COPY_RATES);
+      }
+
+//    switch(whichParameterType)
+//      {
+//      case FREQ_F:
+//      case RATE_F:
+//          pllMasterBarrier(tr, pr, PLL_THREAD_COPY_RATES);
+//        break;
+//      case ALPHA_F:
+//          pllMasterBarrier(tr, pr, PLL_THREAD_COPY_ALPHA);
+//        break;
+//      case LXRATE_F:
+//      case LXWEIGHT_F:
+//          pllMasterBarrier(tr, pr, PLL_THREAD_COPY_LG4X_RATES);
+//        break;
+//      default:
+//        assert(0);
+//      }
+
+  #endif    
+
+    
+  assert(pos == numberOfModels);
+
+  rax_free(startLH);
+  rax_free(endLH);
+  rax_free(_a);
+  rax_free(_b);
+  rax_free(_c);
+  rax_free(_fa);
+  rax_free(_fb);
+  rax_free(_fc);
+  rax_free(_param);
+  rax_free(_x);
+  rax_free(startValues);
+  rax_free(startRates);
+  rax_free(startWeights);
+  rax_free(startExponents);
+
+#ifdef _DEBUG_MOD_OPT
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+  if(tr->likelihood < initialLH)
+    printf("%f %f\n", tr->likelihood, initialLH);
+  assert(tr->likelihood >= initialLH);
+#endif
+}
+
+//******************** rate optimization functions ***************************************************/
+
+/** @brief Wrapper function for optimizing base frequency rates
+  *
+  * Wrapper function for optimizing base frequency rates of \a numberOfModels partitions. 
+  * The function iteratively calls the function \a optParamGeneric for optimizing each of the \a states
+  * parameters
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param modelEpsilon
+  *   Optimization tolerance passed down to \a optParamGeneric
+  *
+  * @param ll
+  *   Linkage list
+  *
+  * @param numberOfModels
+  *   Number of partitions that we are optimizing
+  *
+  * @param states
+  *   Number of states
+  */
+static void optFreqs(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll, int numberOfModels, int states)
+{ 
+  int 
+    rateNumber;
+
+  double
+    freqMin = -1000000.0,
+    freqMax = 200.0;
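+  /* note that these bounds apply to the frequency exponents (FREQ_F optimizes
+     freqExponents rather than the frequencies themselves), hence the very wide,
+     asymmetric range */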
+  
+  for(rateNumber = 0; rateNumber < states; rateNumber++)
+    optParamGeneric(tr, pr, modelEpsilon, ll, numberOfModels, rateNumber, freqMin, freqMax, FREQ_F);   
+}
+
+/** @brief Optimize base frequencies 
+ *  
+ *  Wrapper function for optimizing base frequencies
+ *
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param modelEpsilon
+ *    Optimization tolerance passed down to \a optFreqs
+ *
+ *  @param ll
+ *    Linkage list
+ *
+ */
+void pllOptBaseFreqs(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll)
+{
+  int 
+    i,
+    states,
+    dnaPartitions = 0,
+    aaPartitions  = 0,
+    binPartitions = 0;
+
+  /* first do DNA */
+
+  /* Set the valid flag in linkage list to PLL_TRUE for all DNA partitions */
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+        {
+        case PLL_DNA_DATA:  
+          states = pr->partitionData[ll->ld[i].partitionList[0]]->states; 
+          if(pr->partitionData[ll->ld[i].partitionList[0]]->optimizeBaseFrequencies)
+            {
+              ll->ld[i].valid = PLL_TRUE;
+              dnaPartitions++;              
+            }
+          else
+             ll->ld[i].valid = PLL_FALSE;
+          break;       
+        case PLL_BINARY_DATA:
+        case PLL_AA_DATA:
+          ll->ld[i].valid = PLL_FALSE;
+          break;
+        default:
+          assert(0);
+        }      
+    }   
+
+  /* Optimize the frequency rates of all DNA partitions */
+  if(dnaPartitions > 0)
+    optFreqs(tr, pr, modelEpsilon, ll, dnaPartitions, states);
+  
+  /* then AA */
+
+  /* find all partitions that have frequency optimization enabled */ 
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+        {
+        case PLL_AA_DATA:
+          states = pr->partitionData[ll->ld[i].partitionList[0]]->states;             
+          if(pr->partitionData[ll->ld[i].partitionList[0]]->optimizeBaseFrequencies)
+            {
+              ll->ld[i].valid = PLL_TRUE;
+              aaPartitions++;           
+            }
+          else
+            ll->ld[i].valid = PLL_FALSE; 
+          break;
+        case PLL_DNA_DATA:      
+        case PLL_BINARY_DATA:
+          ll->ld[i].valid = PLL_FALSE;
+          break;
+        default:
+          assert(0);
+        }        
+    }
+
+  if(aaPartitions > 0)      
+    optFreqs(tr, pr, modelEpsilon, ll, aaPartitions, states);
+
+  /* then binary */
+  for(i = 0; i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+	{
+	case PLL_BINARY_DATA:	  
+	  states = pr->partitionData[ll->ld[i].partitionList[0]]->states; 	      
+	  if(pr->partitionData[ll->ld[i].partitionList[0]]->optimizeBaseFrequencies)
+	    {
+	      ll->ld[i].valid = PLL_TRUE;
+	      binPartitions++;		
+	    }
+	  else
+	    ll->ld[i].valid = PLL_FALSE; 
+	  break;
+	case PLL_DNA_DATA:	  
+	case PLL_AA_DATA:      
+	case PLL_SECONDARY_DATA:
+	case PLL_SECONDARY_DATA_6:
+	case PLL_SECONDARY_DATA_7:
+	case PLL_GENERIC_32:
+	case PLL_GENERIC_64:	    
+	  ll->ld[i].valid = PLL_FALSE;
+	  break;
+	default:
+	  assert(0);
+	}	 
+    }
+
+  if(binPartitions > 0)      
+    optFreqs(tr, pr, modelEpsilon, ll, binPartitions, states);
+
+  /* done */
+
+  for(i = 0; ll && i < ll->entries; i++)
+    ll->ld[i].valid = PLL_TRUE;
+}
+
+
+
+/* new version for optimizing rates, an external loop that iterates over the rates */
+/** @brief Wrapper function for optimizing substitution rates
+  *
+  * Wrapper function for optimizing substitution rates of \a numberOfModels partitions. 
+  * The function determines the  number of free parameters and iteratively calls the 
+  * function \a optParamGeneric for optimizing each parameter
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param modelEpsilon
+  *   Optimization tolerance passed down to \a optParamGeneric
+  *
+  * @param ll
+  *   Linkage list
+  *
+  * @param numberOfModels
+  *   Number of partitions that we are optimizing
+  *
+  * @param states
+  *   Number of states
+  */
+static void optRates(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll, int numberOfModels, int states)
+{
+  int
+    rateNumber,
+    numberOfRates = ((states * states - states) / 2) - 1;
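+  /* a reversible model with s states has s*(s-1)/2 exchangeability rates; one of
+     them is kept fixed as the reference rate, hence the "- 1" (e.g. 5 free rates
+     for DNA) */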
+
+  for(rateNumber = 0; rateNumber < numberOfRates; rateNumber++)
+    optParamGeneric(tr, pr, modelEpsilon, ll, numberOfModels, rateNumber, PLL_RATE_MIN, PLL_RATE_MAX, RATE_F);
+}
+
+
+/* figure out if all AA models have been assigned a joint GTR matrix */
+
+/** @brief Check whether all protein partitions have been assigned a joint GTR matrix
+  *
+  * Check whether there exists at least one protein partition and whether all
+  * protein partitions have been assigned a joint GTR matrix.
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @return
+  *   Return \b PLL_TRUE in case there exists at least one protein partition and all of
+  *   protein partitions are assigned a joint GTR matrix. Otherwise return \b PLL_FALSE
+  */
+static pllBoolean AAisGTR(partitionList *pr)
+{
+  int i, count = 0;
+
+  for(i = 0; i < pr->numberOfPartitions; i++)
+    {
+      if(pr->partitionData[i]->dataType == PLL_AA_DATA)
+        {
+          count++;
+          if(pr->partitionData[i]->protModels != PLL_GTR)
+            return PLL_FALSE;
+        }
+    }
+
+  if(count == 0)
+    return PLL_FALSE;
+
+  return PLL_TRUE;
+}
+
+
+/* generic substitution matrix (Q matrix) optimization */
+
+/** @brief Optimize substitution rates
+  *
+  * Generic routine for substitution matrix (Q matrix) optimization
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param modelEpsilon
+  *   Optimization tolerance passed down to \a optRates
+  *
+  * @param ll
+  *   Linkage list
+  */
+void pllOptRatesGeneric(pllInstance *tr, partitionList *pr, double modelEpsilon, linkageList *ll)
+{
+  int 
+    i,
+    dnaPartitions = 0,
+    aaPartitions  = 0,
+    states = -1;
+
+  /* assumes homogeneous super-partitions, that either contain DNA or AA partitions !*/
+  /* does not check whether AA are all linked */
+
+  /* 
+     first optimize all rates in DNA data partition matrices. That's where we use the valid field in the 
+     linkage list data structure. 
+   */
+
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+        {
+          case PLL_DNA_DATA:  
+            states = pr->partitionData[ll->ld[i].partitionList[0]]->states;
+	    if(pr->partitionData[ll->ld[i].partitionList[0]]->optimizeSubstitutionRates)
+	      {
+		ll->ld[i].valid = PLL_TRUE;
+		++ dnaPartitions;  
+	      }
+	    else	      
+	      ll->ld[i].valid = PLL_FALSE;	      
+            break;
+          case PLL_BINARY_DATA:
+          case PLL_AA_DATA:
+          case PLL_SECONDARY_DATA:
+          case PLL_SECONDARY_DATA_6:
+          case PLL_SECONDARY_DATA_7:
+          case PLL_GENERIC_32:
+          case PLL_GENERIC_64:
+            ll->ld[i].valid = PLL_FALSE;
+            break;
+          default:
+            assert(0);
+        }      
+    }   
+
+  /* if we have dna partitions in our dataset, let's optimize all 5 rates in their substitution matrices */
+
+  if(dnaPartitions > 0)
+    optRates(tr, pr, modelEpsilon, ll, dnaPartitions, states);
+  
+  /* AA partitions evolving under a GTR model no longer need to be linked; this 
+     responsibility now remains with the library user!
+   */
+  
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+	{
+	case PLL_AA_DATA:
+	  states = pr->partitionData[ll->ld[i].partitionList[0]]->states;
+	  if(pr->partitionData[ll->ld[i].partitionList[0]]->optimizeSubstitutionRates)
+	    {
+	      ll->ld[i].valid = PLL_TRUE;
+	      aaPartitions++;
+	    }
+	  else
+	    ll->ld[i].valid = PLL_FALSE;
+	  break;
+	case PLL_DNA_DATA:          
+	case PLL_BINARY_DATA:
+	case PLL_SECONDARY_DATA:        
+	case PLL_SECONDARY_DATA_6:
+	case PLL_SECONDARY_DATA_7:
+	  ll->ld[i].valid = PLL_FALSE;
+	  break;
+	default:
+	  assert(0);
+	}    
+    }
+  
+  if(aaPartitions > 0)
+    optRates(tr, pr, modelEpsilon, ll, aaPartitions, states); 
+
+  /* done with all partitions, so we can set all entries in the linkage list to valid again :-) */
+
+  for(i = 0; ll && i < ll->entries; i++)
+    ll->ld[i].valid = PLL_TRUE;
+}
+
+
+
+
+
+/*********************FUNCTIONS FOR PSR/CAT model of rate heterogeneity ***************************************/
+
+
+
+
+
+
+static int catCompare(const void *p1, const void *p2)
+{
+ rateCategorize *rc1 = (rateCategorize *)p1;
+ rateCategorize *rc2 = (rateCategorize *)p2;
+
+  double i = rc1->accumulatedSiteLikelihood;
+  double j = rc2->accumulatedSiteLikelihood;
+  
+  if (i > j)
+    return (1);
+  if (i < j)
+    return (-1);
+  return (0);
+}
+
+
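+/* assign every site in [lower, upper) of partition "model" to the rate category
+   whose representative rate is closest to the site's individually optimized rate
+   (an exact match within 0.001 is taken first), then store the per-category rates
+   in perSiteRates */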
+static void categorizePartition(pllInstance *tr, partitionList *pr, rateCategorize *rc, int model, int lower, int upper)
+{
+  int
+    zeroCounter,
+    i, 
+    k;
+  
+  double 
+    diff, 
+    min;
+
+  for (i = lower, zeroCounter = 0; i < upper; i++, zeroCounter++) 
+      {
+        double
+          temp = tr->patrat[i];
+
+        int
+          found = 0;
+        
+        for(k = 0; k < pr->partitionData[model]->numberOfCategories; k++)
+          {
+            if(temp == rc[k].rate || (fabs(temp - rc[k].rate) < 0.001))
+              {
+                found = 1;
+                tr->rateCategory[i] = k; 
+                break;
+              }
+          }
+        
+        if(!found)
+          {
+            min = fabs(temp - rc[0].rate);
+            tr->rateCategory[i] = 0;
+
+            for(k = 1; k < pr->partitionData[model]->numberOfCategories; k++)
+              {
+                diff = fabs(temp - rc[k].rate);
+
+                if(diff < min)
+                  {
+                    min = diff;
+                    tr->rateCategory[i] = k;
+                  }
+              }
+          }
+      }
+
+  for(k = 0; k < pr->partitionData[model]->numberOfCategories; k++)
+    pr->partitionData[model]->perSiteRates[k] = rc[k].rate;
+}
+
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+
+void optRateCatPthreads(pllInstance *tr, partitionList *pr, double lower_spacing, double upper_spacing, double *lhs, int n, int tid)
+{
+  int 
+    model, 
+    i;
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {      
+      int 
+        localIndex = 0;
+
+      pllBoolean 
+        execute = ((tr->manyPartitions && isThisMyPartition(pr, tid, model)) || (!tr->manyPartitions));
+
+      if(execute)
+        for(i = pr->partitionData[model]->lower;  i < pr->partitionData[model]->upper; i++)
+          {
+            if(tr->manyPartitions || (i % n == tid))
+              {
+              
+                double initialRate, initialLikelihood, 
+                  leftLH, rightLH, leftRate, rightRate, v;
+                const double epsilon = 0.00001;
+                int k;        
+                
+                tr->patrat[i] = tr->patratStored[i];     
+                initialRate = tr->patrat[i];
+                
+                initialLikelihood = evaluatePartialGeneric(tr, pr, localIndex, initialRate, model); /* i is real i ??? */
+                
+                
+                leftLH = rightLH = initialLikelihood;
+                leftRate = rightRate = initialRate;
+                
+                k = 1;
+                
+                while((initialRate - k * lower_spacing > 0.0001) && 
+                      ((v = evaluatePartialGeneric(tr, pr, localIndex, initialRate - k * lower_spacing, model))
+                       > leftLH) && 
+                      (fabs(leftLH - v) > epsilon))  
+                  {       
+#ifndef WIN32
+                    if(isnan(v))
+                      assert(0);
+#endif
+                    
+                    leftLH = v;
+                    leftRate = initialRate - k * lower_spacing;
+                    k++;          
+                  }      
+                
+                k = 1;
+                
+                while(((v = evaluatePartialGeneric(tr, pr, localIndex, initialRate + k * upper_spacing, model)) > rightLH) &&
+                      (fabs(rightLH - v) > epsilon))            
+                  {
+#ifndef WIN32
+                    if(isnan(v))
+                      assert(0);
+#endif     
+                    rightLH = v;
+                    rightRate = initialRate + k * upper_spacing;         
+                    k++;
+                  }           
+                
+                if(rightLH > initialLikelihood || leftLH > initialLikelihood)
+                  {
+                    if(rightLH > leftLH)            
+                      {      
+                        tr->patrat[i] = rightRate;
+                        lhs[i] = rightLH;
+                      }
+                    else
+                      {       
+                        tr->patrat[i] = leftRate;
+                        lhs[i] = leftLH;
+                      }
+                  }
+                else
+                  lhs[i] = initialLikelihood;
+                
+                tr->patratStored[i] = tr->patrat[i];
+                localIndex++;
+              }
+          }
+      assert(localIndex == pr->partitionData[model]->width);
+    }
+}
+
+
+
+#else
+
+/** @brief Optimize rates for CAT model
+ *
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param model
+ *    Partition index
+ *
+ *  @param lower_spacing
+ *    Step size used when probing rates below the current per-site rate
+ *
+ *  @param upper_spacing
+ *    Step size used when probing rates above the current per-site rate
+ *
+ *  @param lhs
+ *    Array in which the optimized per-site likelihoods are stored
+ */
+static void optRateCatModel(pllInstance *tr, partitionList *pr, int model, double lower_spacing, double upper_spacing, double *lhs)
+{
+  int lower = pr->partitionData[model]->lower;
+  int upper = pr->partitionData[model]->upper;
+  int i;
+  for(i = lower; i < upper; i++)
+    {
+      double initialRate, initialLikelihood, 
+        leftLH, rightLH, leftRate, rightRate, v;
+      const double epsilon = 0.00001;
+      int k;
+      
+      tr->patrat[i] = tr->patratStored[i];     
+      initialRate = tr->patrat[i];
+      
+      initialLikelihood = evaluatePartialGeneric(tr, pr, i, initialRate, model);
+      
+      
+      leftLH = rightLH = initialLikelihood;
+      leftRate = rightRate = initialRate;
+      
+      k = 1;
+      
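+      /* probe progressively smaller rates (initialRate - k * lower_spacing, kept
+         above 0.0001) while the per-site likelihood keeps improving by more than
+         epsilon; the loop below does the same for larger rates */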
+      while((initialRate - k * lower_spacing > 0.0001) && 
+            ((v = evaluatePartialGeneric(tr, pr, i, initialRate - k * lower_spacing, model))
+             > leftLH) && 
+            (fabs(leftLH - v) > epsilon))  
+        {         
+#ifndef WIN32
+          if(isnan(v))
+            assert(0);
+#endif
+          
+          leftLH = v;
+          leftRate = initialRate - k * lower_spacing;
+          k++;    
+        }      
+      
+      k = 1;
+      
+      while(((v = evaluatePartialGeneric(tr, pr, i, initialRate + k * upper_spacing, model)) > rightLH) &&
+            (fabs(rightLH - v) > epsilon))      
+        {
+#ifndef WIN32
+          if(isnan(v))
+            assert(0);
+#endif     
+          rightLH = v;
+          rightRate = initialRate + k * upper_spacing;   
+          k++;
+        }           
+  
+      if(rightLH > initialLikelihood || leftLH > initialLikelihood)
+        {
+          if(rightLH > leftLH)      
+            {        
+              tr->patrat[i] = rightRate;
+              lhs[i] = rightLH;
+            }
+          else
+            {         
+              tr->patrat[i] = leftRate;
+              lhs[i] = leftLH;
+            }
+        }
+      else
+        lhs[i] = initialLikelihood;
+      
+      tr->patratStored[i] = tr->patrat[i];
+    }
+
+}
+
+
+#endif
+
+
+
+/* 
+   if scaleRates is PLL_FALSE, the per-site rates are left unscaled, i.e. they 
+   are not rescaled to obtain an overall (site-weighted) mean rate of 1.0
+*/
+
+void updatePerSiteRates(pllInstance *tr, partitionList *pr, pllBoolean scaleRates)
+{
+  int 
+    i,
+    model;
+
+  if(pr->perGeneBranchLengths && pr->numberOfPartitions > 1)
+    {            
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        {
+          int          
+            lower = pr->partitionData[model]->lower,
+            upper = pr->partitionData[model]->upper;
+          
+          if(scaleRates)
+            {
+              double 
+                scaler = 0.0,       
+                accRat = 0.0; 
+
+              int 
+                accWgt     = 0;
+              
+              for(i = lower; i < upper; i++)
+                {
+                  int 
+                    w = tr->aliaswgt[i];
+                  
+                  double
+                    rate = pr->partitionData[model]->perSiteRates[tr->rateCategory[i]];
+                  
+                  assert(0 <= tr->rateCategory[i] && tr->rateCategory[i] < tr->maxCategories);
+                  
+                  accWgt += w;
+                  
+                  accRat += (w * rate);
+                }          
+          
+              accRat /= ((double)accWgt);
+          
+              scaler = 1.0 / ((double)accRat);
+                  
+              for(i = 0; i < pr->partitionData[model]->numberOfCategories; i++)
+                pr->partitionData[model]->perSiteRates[i] *= scaler;
+
+              accRat = 0.0;      
+              
+              for(i = lower; i < upper; i++)
+                {
+                  int 
+                    w = tr->aliaswgt[i];
+                  
+                  double
+                    rate = pr->partitionData[model]->perSiteRates[tr->rateCategory[i]];
+                  
+                  assert(0 <= tr->rateCategory[i] && tr->rateCategory[i] < tr->maxCategories);        
+                  
+                  accRat += (w * rate);
+                }                
+
+              accRat /= ((double)accWgt);         
+
+              assert(PLL_ABS(1.0 - accRat) < 1.0E-5);
+            }
+          else
+            {
+              double               
+                accRat = 0.0; 
+
+              int 
+                accWgt     = 0;
+              
+              for(i = lower; i < upper; i++)
+                {
+                  int 
+                    w = tr->aliaswgt[i];
+                  
+                  double
+                    rate = pr->partitionData[model]->perSiteRates[tr->rateCategory[i]];
+                  
+                  assert(0 <= tr->rateCategory[i] && tr->rateCategory[i] < tr->maxCategories);
+                  
+                  accWgt += w;
+                  
+                  accRat += (w * rate);
+                }          
+          
+              accRat /= ((double)accWgt);
+              
+              assert(PLL_ABS(1.0 - accRat) < 1.0E-5);
+            }
+
+          
+#if NOT (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+          {
+            int 
+              localCount = 0;
+            
+            for(i = lower, localCount = 0; i < upper; i++, localCount++)
+              {               
+                pr->partitionData[model]->rateCategory[localCount] = tr->rateCategory[i];
+              }
+          }
+#endif
+        }
+    }
+  else
+    {
+      int
+        accWgt = 0;
+
+      double 
+        scaler = 0.0,       
+        accRat = 0.0; 
+
+      if(scaleRates)
+        {
+          for(model = 0, accRat = 0.0, accWgt = 0; model < pr->numberOfPartitions; model++)
+            {
+              int 
+                localCount = 0,
+                lower = pr->partitionData[model]->lower,
+                upper = pr->partitionData[model]->upper;
+              
+              for(i = lower, localCount = 0; i < upper; i++, localCount++)
+                {
+                  int 
+                    w = tr->aliaswgt[i];
+                  
+                  double
+                    rate = pr->partitionData[model]->perSiteRates[tr->rateCategory[i]];
+                  
+                  assert(0 <= tr->rateCategory[i] && tr->rateCategory[i] < tr->maxCategories);
+                  
+                  accWgt += w;
+                  
+                  accRat += (w * rate);
+                }
+            }
+          
+          accRat /= ((double)accWgt);
+          
+          scaler = 1.0 / ((double)accRat);
+          
+          for(model = 0; model < pr->numberOfPartitions; model++)
+            {
+              for(i = 0; i < pr->partitionData[model]->numberOfCategories; i++)
+                pr->partitionData[model]->perSiteRates[i] *= scaler;
+            }
+
+          for(model = 0, accRat = 0.0; model < pr->numberOfPartitions; model++)
+            {
+              int 
+                localCount = 0,
+                lower = pr->partitionData[model]->lower,
+                upper = pr->partitionData[model]->upper;
+              
+              for(i = lower, localCount = 0; i < upper; i++, localCount++)
+                {
+                  int 
+                    w = tr->aliaswgt[i];
+                  
+                  double
+                    rate = pr->partitionData[model]->perSiteRates[tr->rateCategory[i]];
+                  
+                  assert(0 <= tr->rateCategory[i] && tr->rateCategory[i] < tr->maxCategories);        
+                  
+                  accRat += (w * rate);
+                }
+            }           
+
+          accRat /= ((double)accWgt);     
+
+          assert(PLL_ABS(1.0 - accRat) < 1.0E-5);
+        }
+      else
+        {
+          for(model = 0, accRat = 0.0, accWgt = 0; model < pr->numberOfPartitions; model++)
+            {
+              int 
+                localCount = 0,
+                lower = pr->partitionData[model]->lower,
+                upper = pr->partitionData[model]->upper;
+              
+              for(i = lower, localCount = 0; i < upper; i++, localCount++)
+                {
+                  int 
+                    w = tr->aliaswgt[i];
+                  
+                  double
+                    rate = pr->partitionData[model]->perSiteRates[tr->rateCategory[i]];
+                  
+                  assert(0 <= tr->rateCategory[i] && tr->rateCategory[i] < tr->maxCategories);
+                  
+                  accWgt += w;
+                  
+                  accRat += (w * rate);
+                }
+            }
+          
+          accRat /=  (double)accWgt;
+
+          assert(PLL_ABS(1.0 - accRat) < 1.0E-5);
+        }
+         
+         /*
+       for(model = 0; model < pr->numberOfPartitions; model++)
+        {
+          int 
+            localCount = 0,
+            lower = pr->partitionData[model]->lower,
+            upper = pr->partitionData[model]->upper;
+
+        }  */       
+#if NOT (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        {                        
+          int 
+            localCount,
+            lower = pr->partitionData[model]->lower,
+            upper = pr->partitionData[model]->upper;
+          
+          for(i = lower, localCount = 0; i < upper; i++, localCount++)
+              pr->partitionData[model]->rateCategory[localCount] = tr->rateCategory[i];
+        }
+#endif
+    }
+  
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier(tr, pr, PLL_THREAD_COPY_RATE_CATS);
+#endif               
+}
+
+/** @brief Optimize rate categories for CAT model
+ *
+ *  Optimize rate categories for CAT model
+ *
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param _maxCategories
+ *    Maximum number of rate categories
+ */
+static void optimizeRateCategories(pllInstance *tr, partitionList *pr, int _maxCategories)
+{
+  assert(_maxCategories > 0);
+
+  if(_maxCategories > 1)
+    {
+      double  
+        temp,  
+        lower_spacing, 
+        upper_spacing,
+        initialLH = tr->likelihood,     
+        *ratStored = (double *)rax_malloc(sizeof(double) * tr->originalCrunchedLength),
+        /**lhs =       (double *)malloc(sizeof(double) * tr->originalCrunchedLength),*/
+        **oldCategorizedRates = (double **)rax_malloc(sizeof(double *) * pr->numberOfPartitions);
+
+      int  
+        i,
+        k,
+        maxCategories = _maxCategories,
+        *oldCategory =  (int *)rax_malloc(sizeof(int) * tr->originalCrunchedLength),
+        model,
+        *oldNumbers = (int *)rax_malloc(sizeof(int) * pr->numberOfPartitions);
+  
+      assert(isTip(tr->start->number, tr->mxtips));         
+      
+      pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+      if(tr->optimizeRateCategoryInvocations == 1)
+        {
+          lower_spacing = 0.5 / ((double)(tr->optimizeRateCategoryInvocations));
+          upper_spacing = 1.0 / ((double)(tr->optimizeRateCategoryInvocations));
+        }
+      else
+        {
+          lower_spacing = 0.05 / ((double)(tr->optimizeRateCategoryInvocations));
+          upper_spacing = 0.1 / ((double)(tr->optimizeRateCategoryInvocations));
+        }
+      
+      if(lower_spacing < 0.001)
+        lower_spacing = 0.001;
+      
+      if(upper_spacing < 0.001)
+        upper_spacing = 0.001;
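+      /* the probe step sizes shrink with every invocation of this routine (clamped
+         below at 0.001), so later calls refine the per-site rates ever more finely */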
+      
+      tr->optimizeRateCategoryInvocations = tr->optimizeRateCategoryInvocations + 1;
+
+      memcpy(oldCategory, tr->rateCategory, sizeof(int) * tr->originalCrunchedLength);       
+      memcpy(ratStored,   tr->patratStored, sizeof(double) * tr->originalCrunchedLength);
+
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        {
+          oldNumbers[model]          = pr->partitionData[model]->numberOfCategories;
+
+          oldCategorizedRates[model] = (double *)rax_malloc(sizeof(double) * tr->maxCategories);
+          
+          memcpy(oldCategorizedRates[model], pr->partitionData[model]->perSiteRates, tr->maxCategories * sizeof(double));
+        }      
+      
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+      /*tr->lhs = lhs;*/
+      tr->lower_spacing = lower_spacing;
+      tr->upper_spacing = upper_spacing;
+      pllMasterBarrier(tr, pr, PLL_THREAD_RATE_CATS);
+#else      
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        optRateCatModel(tr, pr, model, lower_spacing, upper_spacing, tr->lhs);
+#endif     
+
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        {     
+          int 
+            where = 1,
+            found = 0,
+            width = pr->partitionData[model]->upper -  pr->partitionData[model]->lower,
+            upper = pr->partitionData[model]->upper,
+            lower = pr->partitionData[model]->lower;
+            
+          rateCategorize 
+            *rc = (rateCategorize *)rax_malloc(sizeof(rateCategorize) * width);          
+        
+          for (i = 0; i < width; i++)
+            {
+              rc[i].accumulatedSiteLikelihood = 0.0;
+              rc[i].rate = 0.0;
+            }  
+        
+          rc[0].accumulatedSiteLikelihood = tr->lhs[lower];
+          rc[0].rate = tr->patrat[lower];
+        
+          tr->rateCategory[lower] = 0;
+        
+          for (i = lower + 1; i < upper; i++) 
+            {
+              temp = tr->patrat[i];
+              found = 0;
+            
+              for(k = 0; k < where; k++)
+                {
+                  if(temp == rc[k].rate || (fabs(temp - rc[k].rate) < 0.001))
+                    {
+                      found = 1;                                                
+                      rc[k].accumulatedSiteLikelihood += tr->lhs[i];    
+                      break;
+                    }
+                }
+            
+              if(!found)
+                {           
+                  rc[where].rate = temp;            
+                  rc[where].accumulatedSiteLikelihood += tr->lhs[i];        
+                  where++;
+                }
+            }
+        
+          qsort(rc, where, sizeof(rateCategorize), catCompare);
+        
+          if(where < maxCategories)
+            {
+              pr->partitionData[model]->numberOfCategories = where;
+              categorizePartition(tr, pr, rc, model, lower, upper);
+            }
+          else
+            {
+              pr->partitionData[model]->numberOfCategories = maxCategories;
+              categorizePartition(tr, pr, rc, model, lower, upper);
+            }
+        
+          rax_free(rc);
+        }
+                
+      updatePerSiteRates(tr, pr, PLL_TRUE);
+
+      pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+      
+      if(tr->likelihood < initialLH)
+        {                         
+          for(model = 0; model < pr->numberOfPartitions; model++)
+            {
+              pr->partitionData[model]->numberOfCategories = oldNumbers[model];
+              memcpy(pr->partitionData[model]->perSiteRates, oldCategorizedRates[model], tr->maxCategories * sizeof(double));
+            }         
+          
+          memcpy(tr->patratStored, ratStored, sizeof(double) * tr->originalCrunchedLength);
+          memcpy(tr->rateCategory, oldCategory, sizeof(int) * tr->originalCrunchedLength);           
+          
+          updatePerSiteRates(tr, pr, PLL_FALSE);
+          
+          pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+          /* printf("REVERT: %1.40f %1.40f\n", initialLH, tr->likelihood); */
+
+          assert(initialLH == tr->likelihood);
+        }
+          
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        rax_free(oldCategorizedRates[model]);
+                   
+      rax_free(oldCategorizedRates);
+      rax_free(oldCategory);
+      rax_free(ratStored);       
+      /*     rax_free(lhs); */
+      rax_free(oldNumbers);
+    }
+}
+  
+
+/************************* end of functions for CAT model of rate heterogeneity */
+
+
+
+
+/*****************************************************************************************************/
+
+/* reset all branch lengths in the tree to default values */
+
+/** @brief Reset all branch lengths to default values
+  
+    Reset all branch lengths in the tree instance to default values (\b PLL_DEFAULTZ)
+
+    @param tr
+      PLL instance
+  */
+void resetBranches(pllInstance *tr)
+{
+  nodeptr  p, q;
+  int  nodes, i;
+  
+  nodes = tr->mxtips  +  3 * (tr->mxtips - 2);
+  p = tr->nodep[1];
+  while (nodes-- > 0) 
+    {   
+	  p->z[0] = PLL_DEFAULTZ;
+	  if (tr->perGeneBranchLengths)
+        for(i = 1; i < PLL_NUM_BRANCHES; i++)
+          p->z[i] = PLL_DEFAULTZ;
+        
+      q = p->next;
+      while(q != p)
+        {       
+    	  q->z[0] = PLL_DEFAULTZ;
+    	  if (tr->perGeneBranchLengths)
+            for(i = 1; i < PLL_NUM_BRANCHES; i++)
+              q->z[i] = PLL_DEFAULTZ;
+          q = q->next;
+        }
+      p++;
+    }
+}
+
+/**
+ * @brief Adjust frequencies in case some base frequency is close to zero.
+ */
+static void smoothFrequencies(double *frequencies, int numberOfFrequencies) {
+	int countScale = 0, l, loopCounter = 0;
+
+	for (l = 0; l < numberOfFrequencies; l++)
+		if (frequencies[l] < PLL_FREQ_MIN)
+			countScale++;
+
+	if (countScale > 0) {
+		while (countScale > 0) {
+			double correction = 0.0, factor = 1.0;
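+			/* correction accumulates the probability mass needed to lift sub-minimum
+			   frequencies up to PLL_FREQ_MIN; that mass is then removed proportionally
+			   from the larger frequencies (f -= f * correction * factor), and the outer
+			   loop repeats until no frequency remains below PLL_FREQ_MIN */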
+
+			for (l = 0; l < numberOfFrequencies; l++) {
+				if (frequencies[l] == 0.0)
+					correction += PLL_FREQ_MIN;
+				else if (frequencies[l] < PLL_FREQ_MIN) {
+					correction += (PLL_FREQ_MIN - frequencies[l]);
+					factor -= (PLL_FREQ_MIN - frequencies[l]);
+				}
+			}
+
+			countScale = 0;
+
+			for (l = 0; l < numberOfFrequencies; l++) {
+				if (frequencies[l] >= PLL_FREQ_MIN)
+					frequencies[l] = frequencies[l] - (frequencies[l] * correction * factor);
+				else
+					frequencies[l] = PLL_FREQ_MIN;
+
+				if (frequencies[l] < PLL_FREQ_MIN)
+					countScale++;
+			}
+			assert(loopCounter < 100);
+			loopCounter++;
+		}
+	}
+}
+
+/**
+ * @brief Evaluate all possible protein models
+ */
+static void optimizeProteinModels(pllInstance *tr, partitionList * pr, int *bestIndex, double *bestScores, pllBoolean empiricalFreqs)
+{
+	int modelIndex, partitionIndex,
+	    numProteinModels = PLL_AUTO;
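+	/* PLL_AUTO is assumed here to be the enum entry following the last concrete
+	   protein model, so it doubles as the number of models to evaluate */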
+
+	for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+		bestIndex[partitionIndex] = -1;
+		bestScores[partitionIndex] = PLL_UNLIKELY;
+	}
+
+	if (empiricalFreqs) {
+		double ** freqs = pllBaseFrequenciesInstance(tr, pr);
+		for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+			smoothFrequencies(freqs[partitionIndex], PLL_NUM_AA_STATES);
+			memcpy(pr->partitionData[partitionIndex]->empiricalFrequencies, freqs[partitionIndex], PLL_NUM_AA_STATES*sizeof(double));
+		}
+		free(freqs);
+	}
+
+	for (modelIndex = 0; modelIndex < numProteinModels; modelIndex++) {
+		for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+			if (pr->partitionData[partitionIndex]->protModels == PLL_AUTO) {
+
+				pr->partitionData[partitionIndex]->autoProtModels = modelIndex;
+				pr->partitionData[partitionIndex]->protUseEmpiricalFreqs =
+						empiricalFreqs;
+
+				assert(!pr->partitionData[partitionIndex]->optimizeBaseFrequencies);
+
+				pllInitReversibleGTR(tr, pr, partitionIndex);
+			}
+		}
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+		pllMasterBarrier (tr, pr, PLL_THREAD_COPY_RATES);
+#endif
+
+		/* optimize branch lengths */
+		resetBranches(tr);
+		pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+		pllOptimizeBranchLengths(tr, pr, 16);
+
+		for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+			if (pr->partitionData[partitionIndex]->protModels == PLL_AUTO) {
+				if (pr->partitionData[partitionIndex]->partitionLH > bestScores[partitionIndex]) {
+					/* improved best score */
+					bestScores[partitionIndex] = pr->partitionData[partitionIndex]->partitionLH;
+					bestIndex[partitionIndex] = modelIndex;
+				}
+			}
+		}
+	}
+}
+
+/* 
+   automatically compute the best protein substitution model for the dataset at hand.
+ */
+
+/** @brief Compute the best protein substitution model
+  *
+  * Automatically compute the best protein substitution model for the dataset
+  * at hand
+  *
+  * @param tr
+  *   The PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  */
+static void autoProtein(pllInstance *tr, partitionList *pr)
+{
+	int countAutos = 0, partitionIndex;
+
+	/* count the number of partitions with model set to PLL_AUTO */
+	for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++)
+		if (pr->partitionData[partitionIndex]->protModels == PLL_AUTO)
+			countAutos++;
+
+	/* if there are partitions with model set to PLL_AUTO compute the best model */
+	if (countAutos > 0) {
+		int *bestIndex = (int*) rax_malloc(
+				sizeof(int) * pr->numberOfPartitions),
+		    *bestIndexEmpFreqs = (int*) rax_malloc(
+				sizeof(int) * pr->numberOfPartitions),
+		    *oldIndex =
+				(int*) rax_malloc(sizeof(int) * pr->numberOfPartitions);
+
+		pllBoolean *oldFreqs = (pllBoolean*) malloc(
+				sizeof(pllBoolean) * pr->numberOfPartitions);
+
+		double startLH,
+		      *bestScores = (double*) rax_malloc(
+				sizeof(double) * pr->numberOfPartitions),
+			  *bestScoresEmpFreqs = (double*) rax_malloc(
+				sizeof(double) * pr->numberOfPartitions);
+
+		topolRELL_LIST *rl = (topolRELL_LIST *) rax_malloc(
+				sizeof(topolRELL_LIST));
+
+		initTL(rl, tr, 1);
+		saveTL(rl, tr, 0);
+
+		pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+		/* store the initial likelihood of the tree with the currently assigned protein models */
+		startLH = tr->likelihood;
+
+		/* save the currently assigned protein model for each PLL_AUTO partition */
+		for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+			oldIndex[partitionIndex] = pr->partitionData[partitionIndex]->autoProtModels;
+			oldFreqs[partitionIndex] = pr->partitionData[partitionIndex]->protUseEmpiricalFreqs;
+			bestIndex[partitionIndex] = -1;
+			bestScores[partitionIndex] = PLL_UNLIKELY;
+		}
+
+		/* evaluate all models with fixed base frequencies */
+		optimizeProteinModels(tr, pr, bestIndex, bestScores, PLL_FALSE);
+		/* evaluate all models with fixed empirical frequencies */
+		optimizeProteinModels(tr, pr, bestIndexEmpFreqs, bestScoresEmpFreqs, PLL_TRUE);
+
+		/* model selection */
+		for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+			if (pr->partitionData[partitionIndex]->protModels == PLL_AUTO) {
+				int bestIndexFixed = bestIndex[partitionIndex],
+				    bestIndexEmp = bestIndexEmpFreqs[partitionIndex];
+
+				double bestLhFixed = bestScores[partitionIndex],
+					   bestLhEmp = bestScoresEmpFreqs[partitionIndex],
+					   samples = 0.0,
+					   freeParamsFixed = 0.0,
+					   freeParamsEmp = 0.0;
+
+				samples = pr->partitionData[partitionIndex]->partitionWeight;
+				assert(samples > 0.0 && samples >= pr->partitionData[partitionIndex]->width);
+
+				assert(tr->ntips == tr->mxtips);
+				freeParamsFixed = freeParamsEmp = (2 * tr->ntips - 3);
+				freeParamsEmp += 19.0;
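+				/* 2n-3 branch lengths of an unrooted binary tree; empirical amino-acid
+				   frequencies add 19 free parameters (20 frequencies constrained to
+				   sum to one) */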
+
+				switch (tr->rateHetModel) {
+				case PLL_CAT:
+					freeParamsFixed +=
+							(double) pr->partitionData[partitionIndex]->numberOfCategories;
+					freeParamsEmp +=
+							(double) pr->partitionData[partitionIndex]->numberOfCategories;
+					break;
+				case PLL_GAMMA:
+					freeParamsFixed += 1.0;
+					freeParamsEmp += 1.0;
+					break;
+				default:
+					assert(0);
+				}
+
+				switch (tr->autoProteinSelectionType) {
+				case PLL_AUTO_ML:
+					if (bestLhFixed > bestLhEmp) {
+						pr->partitionData[partitionIndex]->autoProtModels =
+								bestIndexFixed;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 0;
+					} else {
+						pr->partitionData[partitionIndex]->autoProtModels = bestIndexEmp;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 1;
+					}
+					break;
+				case PLL_AUTO_BIC: {
+					//BIC: -2 * lnL + k * ln(n)
+					double bicFixed = -2.0 * bestLhFixed
+							+ freeParamsFixed * log(samples),
+						   bicEmp = -2.0
+							* bestLhEmp + freeParamsEmp * log(samples);
+
+					if (bicFixed < bicEmp) {
+						pr->partitionData[partitionIndex]->autoProtModels =
+								bestIndexFixed;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 0;
+					} else {
+						pr->partitionData[partitionIndex]->autoProtModels = bestIndexEmp;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 1;
+					}
+				}
+					break;
+				case PLL_AUTO_AIC: {
+					//AIC: 2 * (k - lnL)
+					double aicFixed = 2.0 * (freeParamsFixed - bestLhFixed),
+							aicEmp = 2.0 * (freeParamsEmp - bestLhEmp);
+
+					if (aicFixed < aicEmp) {
+						pr->partitionData[partitionIndex]->autoProtModels =
+								bestIndexFixed;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 0;
+					} else {
+						pr->partitionData[partitionIndex]->autoProtModels = bestIndexEmp;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 1;
+					}
+				}
+					break;
+				case PLL_AUTO_AICC: {
+					//AICc: AIC + (2 * k * (k + 1))/(n - k - 1)
+					double aiccFixed, aiccEmp;
+
+					/*
+					 * Even though samples and freeParamsFixed are fp variables, they are actually integers.
+					 * That's why we are comparing with a 0.5 threshold.
+					 */
+
+					if (fabs(samples - freeParamsFixed - 1.0) < 0.5)
+						aiccFixed = 0.0;
+					else
+						aiccFixed = (2.0 * (freeParamsFixed - bestLhFixed))
+								+ ((2.0 * freeParamsFixed
+										* (freeParamsFixed + 1.0))
+										/ (samples - freeParamsFixed - 1.0));
+
+					if (fabs(samples - freeParamsEmp - 1.0) < 0.5)
+						aiccEmp = 0.0;
+					else
+						aiccEmp = (2.0 * (freeParamsEmp - bestLhEmp))
+								+ ((2.0 * freeParamsEmp * (freeParamsEmp + 1.0))
+										/ (samples - freeParamsEmp - 1.0));
+
+					if (aiccFixed < aiccEmp) {
+						pr->partitionData[partitionIndex]->autoProtModels =
+								bestIndexFixed;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 0;
+					} else {
+						pr->partitionData[partitionIndex]->autoProtModels = bestIndexEmp;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 1;
+					}
+				}
+					break;
+				default:
+					assert(0);
+				}
+
+				pllInitReversibleGTR(tr, pr, partitionIndex);
+			}
+		}
+
+		resetBranches(tr);
+		pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+		pllOptimizeBranchLengths(tr, pr, 64);
+
+		/* set the protein model of PLL_AUTO partitions to the best computed and reset model parameters */
+		for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+			if (pr->partitionData[partitionIndex]->protModels == PLL_AUTO) {
+				pr->partitionData[partitionIndex]->autoProtModels = bestIndex[partitionIndex];
+				pllInitReversibleGTR(tr, pr, partitionIndex);
+			}
+		}
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+		pllMasterBarrier(tr, pr, PLL_THREAD_COPY_RATES);
+#endif
+
+		/* compute again the likelihood of the tree */
+		resetBranches(tr);
+		pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+		pllOptimizeBranchLengths(tr, pr, 64);
+
+		/* check if the likelihood of the tree with the new protein models assigned to PLL_AUTO partitions is better than the one with the old protein models */
+		if (tr->likelihood < startLH) {
+			for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+				if (pr->partitionData[partitionIndex]->protModels == PLL_AUTO) {
+					pr->partitionData[partitionIndex]->autoProtModels = oldIndex[partitionIndex];
+					pllInitReversibleGTR(tr, pr, partitionIndex);
+				}
+			}
+
+			//this barrier needs to be called in the library
+			//#ifdef _USE_PTHREADS
+			//pllMasterBarrier(tr, pr, PLL_THREAD_COPY_RATES);
+			//#endif
+
+			/* Restore the topology. rl holds the topology before the optimization. However,
+			 since the topology doesn't change - only the branch lengths do - maybe we
+			 could write a new routine that will store only the branch lengths and restore them */
+			restoreTL(rl, tr, 0,
+					pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+			pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+		}
+
+		assert(tr->likelihood >= startLH);
+
+		freeTL(rl);
+		rax_free(rl);
+
+		rax_free(oldIndex);
+		rax_free(bestIndex);
+		rax_free(bestIndexEmpFreqs);
+		rax_free(bestScores);
+		rax_free(bestScoresEmpFreqs);
+	}
+}
+
+
+/* iterative procedure for optimizing all model parameters */
+
+/* @brief Optimize all model parameters
+ *
+ * Iterative procedure for optimizing all model parameters
+ *
+ * @param tr
+ *   PLL instance
+ *
+ * @param pr
+ *   List of partitions
+ *
+ * @param likelihoodEpsilon
+ *   Optimize model parameters until the log-likelihood improvement between two
+ *   consecutive optimization rounds drops below \a likelihoodEpsilon
+ *
+ * @todo
+ *   Describe likelihoodEpsilon. Understand the TODO marked blocks.
+ */
+void modOpt(pllInstance *tr, partitionList *pr, double likelihoodEpsilon)
+{ 
+  int catOpt = 0; 
+  double 
+    inputLikelihood,
+    currentLikelihood,
+    modelEpsilon = 0.0001;
+
+  /* linkage lists for alpha, the GTR substitution matrices and the base frequencies;
+     p-invar has actually been omitted in this version of the code */
+
+  linkageList
+    *alphaList = pr->alphaList,
+    *rateList  = pr->rateList,
+    *freqList  = pr->freqList;
+
+  modelEpsilon = 0.0001;
+
+  // test code for library
+  if (0)
+   {
+     
+      //assuming that we have three partitions for testing here 
+
+      //alphaList = initLinkageListString("0,1,2", pr);
+      //rateList  = initLinkageListString("0,1,1", pr);
+    
+      //init_Q_MatrixSymmetries("0,1,2,3,4,5", pr, 0);
+      //init_Q_MatrixSymmetries("0,1,2,3,4,4", pr, 1);
+      //init_Q_MatrixSymmetries("0,1,1,2,3,4", pr, 2);
+      
+      //function that checks that partitions with linked Q matrices, as in our example above,
+      //do not have different configurations of the Q matrix as set by the init_Q_MatrixSymmetries() function
+      //e.g., one would have HKY and one would have GTR, while the user claims that they are linked
+      //in our example, the Q matrices of partitions 1 and 2 are linked 
+      //but we set different matrix symmetries via 
+      // init_Q_MatrixSymmetries("0,1,2,3,4,4", tr, 1);
+      // and
+      // init_Q_MatrixSymmetries("0,1,1,2,3,4", tr, 2);
+      //
+      //the function just lets assertions fail for the time being .....
+
+      //checkMatrixSymnmetriesAndLinkage(pr, rateList);
+
+  /* alpha parameters and p-invar parameters are unlinked.
+     this is the point where I actually hard-coded this in RAxML */
+
+  /* call the dedicated function for linking the GTR matrix across all AA data partitions 
+     If we have only DNA data all GTR matrix estimates will be unlinked.
+     */
+   }
+  else
+   {
+     //alphaList = initLinkageList(unlinked, pr);
+     //freqList  = initLinkageList(unlinked, pr);
+     //rateList  = initLinkageListGTR(pr);
+   }
+
+  tr->start = tr->nodep[1];
+
+  /* This check is here to make sure that the likelihood 
+     computed prior to entering modOpt() is consistent 
+     with the likelihood when entering modOpt().
+     This allows us to ensure that we didn't forget to update anything prior 
+     to entering this function.
+   */
+  inputLikelihood = tr->likelihood;
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+  assert (inputLikelihood == tr->likelihood);
+
+  do
+  {           
+    //printBothOpen("cur LH: %f\n", tr->likelihood);
+    currentLikelihood = tr->likelihood;     
+
+#ifdef _DEBUG_MOD_OPT
+      printf ("start: %f\n", currentLikelihood);
+#endif
+
+    pllOptRatesGeneric(tr, pr, modelEpsilon, rateList);
+
+    pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+#ifdef _DEBUG_MOD_OPT
+    printf ("after rates %f\n", tr->likelihood);
+#endif
+
+    autoProtein(tr, pr);
+
+    pllOptimizeBranchLengths(tr, pr, 2); // 0.0625 * 32 = 2.0
+
+#ifdef _DEBUG_MOD_OPT
+    pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+    printf("after br-len 1 %f\n", tr->likelihood); 
+#endif
+
+    pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+    pllOptBaseFreqs(tr, pr, modelEpsilon, freqList);
+    
+    pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+    
+    pllOptimizeBranchLengths(tr, pr, 2); // 0.0625 * 32 = 2.0
+
+#ifdef _DEBUG_MOD_OPT
+    pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE); 
+    printf("after pllOptBaseFreqs 1 %f\n", tr->likelihood);
+#endif 
+
+    switch(tr->rateHetModel)
+    {
+      case PLL_GAMMA:      
+        pllOptAlphasGeneric (tr, pr, modelEpsilon, alphaList);
+        pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+#ifdef _DEBUG_MOD_OPT
+          printf("after alphas %f\n", tr->likelihood); 
+#endif
+
+        pllOptimizeBranchLengths(tr, pr, 3); // 0.1 * 32 = 3.2
+
+#ifdef _DEBUG_MOD_OPT
+          pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);  
+          printf("after br-len 2 %f\n", tr->likelihood); 
+#endif
+        break;
+      case PLL_CAT:
+        if(catOpt < 3)
+        {                            
+          pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);  
+          optimizeRateCategories(tr, pr, tr->categories);
+#ifdef _DEBUG_MOD_OPT
+            pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);  
+            printf("after cat-opt %f\n", tr->likelihood); 
+#endif
+          catOpt++;
+        }
+        break;    
+      default:
+        assert(0);
+    }                   
+
+    if(tr->likelihood < currentLikelihood)
+     {
+      printf("%.20f %.20f\n", tr->likelihood, currentLikelihood);
+      printf("Difference: %.20f\n",tr->likelihood - currentLikelihood);
+    }
+    assert (tr->likelihood - currentLikelihood > 0.000000000000001);
+    //assert(tr->likelihood > currentLikelihood);
+
+  }
+  while(fabs(currentLikelihood - tr->likelihood) > likelihoodEpsilon);  
+  /* TODO: Why do we compare the computed likelihood against currentLikelihood, which is the likelihood before THIS optimization round? Why don't we
+     rather compare it against the initial likelihood (the one before calling modOpt)? Isn't it possible to end up in an endless loop? */
+
+  
+}
+
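+/* A minimal usage sketch for modOpt() (illustrative only; assumes the pllInstance
+   `tr` and partitionList `pr` were created and initialized elsewhere and that the
+   declarations above are visible to the caller):
+
+     pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);  // sync tr->likelihood with the current tree
+     modOpt (tr, pr, 0.1);                                            // iterate until the per-round improvement is below 0.1 lnL units
+     printf ("optimized log-likelihood: %f\n", tr->likelihood);
+*/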
diff --git a/pll/parsePartition.c b/pll/parsePartition.c
new file mode 100644
index 0000000..1ae92af
--- /dev/null
+++ b/pll/parsePartition.c
@@ -0,0 +1,388 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file parsePartition.c
+ * @brief Collection of routines for parsing and processing a partition (model) file
+ *
+ * @defgroup parsePartitionFileGroup Reading and parsing partition (model) files
+ * This set of functions handles the reading and parsing of partition files, i.e.
+ * files that contain alignment partition definitions and corresponding models.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+extern const char *protModels[PLL_NUM_PROT_MODELS];
+
+static void destroy_model_names(pllHashTable * hashTable)
+{
+  pllHashDestroy (&hashTable, rax_free);
+}
+
+static pllHashTable * init_model_names (void)
+{
+  int i;
+  int * item;
+
+  pllHashTable * hashTable;
+  hashTable = pllHashInit (PLL_NUM_PROT_MODELS);
+
+  for (i = 0; i < PLL_NUM_PROT_MODELS; ++ i)
+   {
+     item  = (int *) rax_malloc (sizeof (int));
+     *item = i;
+     pllHashAdd (hashTable, pllHashString(protModels[i], hashTable->size), protModels[i], (void *) item);
+   }
+  return hashTable;
+}
+
+/** @ingroup parsePartitionFileGroup
+    @brief Destroy queue structure that contains parsed information from a partition file
+
+    Destroys the structure, and therefore frees allocated memory, that holds parsed information
+    from a partition (model) file
+
+    @param partitions
+      Queue structure with parsed info
+*/
+void pllQueuePartitionsDestroy (pllQueue ** partitions)
+{
+  pllPartitionInfo * pi;
+  pllPartitionRegion * region;
+
+  while (pllQueueRemove (*partitions, (void **)&pi))
+   {
+     while (pllQueueRemove (pi->regionList, (void **) &region))
+      {
+        rax_free (region);
+      }
+     rax_free (pi->regionList);
+     rax_free (pi->partitionName);
+     rax_free (pi->partitionModel);
+     rax_free (pi);
+   }
+  rax_free (*partitions);
+}
+
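+/* An illustrative example of the format accepted by the parser below (one
+   partition per line; whitespace is ignored):
+
+     DNA, gene1 = 1-500
+     DNAX, gene2 = 501-800/3
+     WAGF, proteins = 801-1000, 1200-1300
+
+   The first field is either a data type (BIN/BINX, DNA/DNAX) or a protein model
+   name from the protModels table, optionally prefixed with "ASC_" (ascertainment
+   bias correction) and suffixed with "X" (optimize base frequencies) or "F"
+   (empirical amino-acid frequencies).  Each region is written as "start",
+   "start-end" or "start-end/stride", and regions are separated by commas. */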
+static pllQueue * parse_partition (int * inp, pllHashTable * proteinModelsHash)
+{
+  int input, i;
+  pllLexToken token;
+  int lines = 0;
+  pllQueue * partitions;
+  pllPartitionInfo * pi;
+  pllPartitionRegion * region;
+  int * protIndexPtr;
+  char * modelptr;
+
+  input  = *inp;
+
+  NEXT_TOKEN
+
+  pllQueueInit (&partitions);
+  while (token.tokenType != PLL_TOKEN_EOF)
+  {
+    ++ lines;
+    pi = (pllPartitionInfo *) rax_calloc (1, sizeof (pllPartitionInfo));
+    pllQueueInit (&(pi->regionList));
+    pllQueueAppend (partitions, (void *)pi);
+    CONSUME (PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+
+    /* read partition type */
+    if (token.tokenType != PLL_TOKEN_STRING) 
+     {
+       pllQueuePartitionsDestroy (&partitions);
+       return (0);
+     }
+    pi->partitionModel = my_strndup (token.lexeme, token.len);
+    for (i = 0; i < token.len; ++i) pi->partitionModel[i] = toupper(pi->partitionModel[i]);
+
+    // check partition model
+    pi->protModels              = -1;
+    pi->protUseEmpiricalFreqs   = PLL_FALSE;
+    pi->ascBias                 = PLL_FALSE;
+    pi->optimizeBaseFrequencies = PLL_FALSE;
+
+    /* check if the model contains Asc bias */
+    if (!strncmp(pi->partitionModel, "ASC_", 4))
+      {
+        pi->ascBias = PLL_TRUE;
+        modelptr    = pi->partitionModel + 4;
+      }
+     else
+        modelptr    = pi->partitionModel;
+
+    /* check first for BINARY */
+    if (!strcmp(modelptr, "BIN") || !strcmp(modelptr, "BINX"))
+     {
+       pi->dataType = PLL_BINARY_DATA;
+
+       if (!strcmp(modelptr, "BINX"))
+         pi->optimizeBaseFrequencies = PLL_TRUE;
+     }  /* now for DNA */
+    else if (!strcmp(modelptr, "DNA") || !strcmp(modelptr, "DNAX"))
+     {
+       pi->dataType   = PLL_DNA_DATA;
+
+       if (!strcmp(modelptr, "DNAX")) 
+         pi->optimizeBaseFrequencies = PLL_TRUE; 
+     }
+    else
+     {                  /* and  protein data */
+       pi->dataType  = PLL_AA_DATA;
+
+       if (pllHashSearch (proteinModelsHash, modelptr, (void **) &protIndexPtr))
+        {
+          pi->protModels              = *protIndexPtr;
+          pi->protUseEmpiricalFreqs   = PLL_FALSE;
+          pi->optimizeBaseFrequencies = PLL_FALSE;
+        }
+       else
+        {
+          if (modelptr[token.len - 1] == 'X')
+           {
+             modelptr[token.len - 1] = '\0';
+             if (pllHashSearch (proteinModelsHash, modelptr, (void **) &protIndexPtr))
+              {
+                pi->protModels              = *protIndexPtr;
+                pi->optimizeBaseFrequencies = PLL_TRUE;
+              }
+             modelptr[token.len - 1] = 'X';
+           }
+          else if (modelptr[token.len - 1] == 'F')
+           {
+             modelptr[token.len - 1] = '\0';
+             if (pllHashSearch (proteinModelsHash, modelptr, (void **) &protIndexPtr))
+              {
+                pi->protModels              = *protIndexPtr;
+                pi->protUseEmpiricalFreqs   = PLL_TRUE;
+              }
+             modelptr[token.len - 1] = 'F';
+           }
+          else
+           {
+             pllQueuePartitionsDestroy (&partitions);
+             return (0);
+           }
+        }
+     }
+
+    NEXT_TOKEN
+    CONSUME(PLL_TOKEN_WHITESPACE)
+
+    if (token.tokenType != PLL_TOKEN_COMMA) 
+     {
+       pllQueuePartitionsDestroy (&partitions);
+       return (0);
+     }
+    NEXT_TOKEN
+    CONSUME(PLL_TOKEN_WHITESPACE)
+
+    /* read partition name */
+    if (token.tokenType != PLL_TOKEN_STRING) 
+     {
+       pllQueuePartitionsDestroy (&partitions);
+       return (0);
+     }
+    pi->partitionName = my_strndup (token.lexeme, token.len);
+
+    NEXT_TOKEN
+    CONSUME(PLL_TOKEN_WHITESPACE)
+
+    /* read equal sign */
+    if (token.tokenType != PLL_TOKEN_EQUAL)
+     {
+       pllQueuePartitionsDestroy (&partitions);
+       return (0);
+     }
+    NEXT_TOKEN
+    CONSUME(PLL_TOKEN_WHITESPACE)
+
+    /* read rhs */
+    while (1)
+    {
+      region = (pllPartitionRegion *) rax_malloc (sizeof (pllPartitionRegion));
+      if (token.tokenType != PLL_TOKEN_NUMBER) 
+       {
+         pllQueuePartitionsDestroy (&partitions);
+         return (0);
+       }
+      region->start  = region->end = atoi (token.lexeme);  
+      region->stride = 1;
+      NEXT_TOKEN
+      CONSUME(PLL_TOKEN_WHITESPACE)
+      
+      if  (token.tokenType == PLL_TOKEN_DASH)
+       {
+         NEXT_TOKEN
+         CONSUME(PLL_TOKEN_WHITESPACE)
+         if (token.tokenType != PLL_TOKEN_NUMBER) 
+          {
+            pllQueuePartitionsDestroy (&partitions);
+            return (0);
+          }
+         region->end = atoi (token.lexeme);
+         if (region->end < region->start)
+          {
+            pllQueuePartitionsDestroy (&partitions);
+            return (0);
+          }
+         NEXT_TOKEN
+         CONSUME(PLL_TOKEN_WHITESPACE)
+         if (token.tokenType == PLL_TOKEN_SLASH)
+          {
+            NEXT_TOKEN
+            CONSUME(PLL_TOKEN_WHITESPACE)
+            if (token.tokenType != PLL_TOKEN_NUMBER) 
+             {
+               pllQueuePartitionsDestroy (&partitions);
+               return (0);
+             }
+            region->stride = atoi (token.lexeme);
+            NEXT_TOKEN
+          }
+         CONSUME(PLL_TOKEN_WHITESPACE)
+       }
+       pllQueueAppend (pi->regionList, (void *)region);
+      
+      if (token.tokenType != PLL_TOKEN_COMMA) break;
+      NEXT_TOKEN
+      CONSUME(PLL_TOKEN_WHITESPACE)
+    }
+   CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+  }
+ 
+ return (partitions);
+} 
+
+/** @ingroup parsePartitionFileGroup
+    @brief Dump a parsed partition file in the console
+
+    Prints the parsed contents of a partition file to the console
+
+    @param partitions Queue structure containing parsed information
+*/
+void pllPartitionDump (pllQueue * partitions)
+{
+   struct pllQueueItem * elm;
+   struct pllQueueItem * regionList;
+   pllPartitionInfo * pi;
+   pllPartitionRegion * region;
+
+   elm = partitions->head;
+
+   while (elm)
+    {
+      pi  = (pllPartitionInfo *) elm->item;
+      printf ("%s, %s = ", pi->partitionModel, pi->partitionName);
+      regionList = pi->regionList->head;
+      while (regionList)
+       {
+         region = (pllPartitionRegion *) regionList->item;
+         printf ("%d", region->start);
+         if (region->start != region->end)
+          {
+            printf ("-%d", region->end);
+            if (region->stride != 1) printf ("/%d", region->stride);
+          }
+         regionList = regionList->next;
+         if (regionList) printf (", ");
+       }
+      printf ("\n");
+
+      elm = elm->next;
+    }
+}
+
+/** @ingroup parsePartitionFileGroup
+    @brief Parse a partition (model) file
+
+    Parses the partition file \a filename and stores the information in a queue
+    structure ::pllQueue
+
+    @param filename Name of the partition file
+    @return Queue structure with parsed information
+*/
+pllQueue * pllPartitionParse (const char * filename)
+{
+  long n;
+  char * rawdata;
+  int input;
+  pllQueue * partitions;
+
+  rawdata = pllReadFile (filename, &n);
+  if (!rawdata)
+   {
+     fprintf (stderr, "Error while opening/reading file %s\n", filename);
+     return (0);
+   }
+
+  n = strlen (rawdata);
+
+  init_lexan (rawdata, n);
+  input = get_next_symbol();
+
+  pllHashTable * model_names = init_model_names();
+  partitions  = parse_partition (&input, model_names);
+  destroy_model_names(model_names);
+  
+  rax_free (rawdata);
+  return (partitions);
+}
+
+/** @ingroup parsePartitionFileGroup
+    @brief Parse a partition (model) file
+
+    Parses the partition information stored in string \a p and stores the
+    information in a queue structure ::pllQueue
+
+    @param p Partition information string
+    @return  Queue structure with parsed information
+*/
+pllQueue * pllPartitionParseString (const char * p)
+{
+  long n;
+  int input;
+  pllQueue * partitions;
+
+  n = strlen(p);
+  init_lexan (p, n);
+  input = get_next_symbol();
+
+  pllHashTable * model_names;
+  model_names = init_model_names();
+  partitions = parse_partition (&input, model_names);
+  destroy_model_names(model_names);
+  
+  return (partitions);
+}
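+
+/* A minimal usage sketch for the string-based entry point (illustrative only;
+   the partition string is an example):
+
+     pllQueue * parts = pllPartitionParseString ("DNA, p1 = 1-500\nWAG, p2 = 501-800\n");
+     if (parts)
+      {
+        pllPartitionDump (parts);              // print the parsed partitions
+        pllQueuePartitionsDestroy (&parts);    // free the queue and its contents
+      }
+*/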
diff --git a/pll/parsePartition.h b/pll/parsePartition.h
new file mode 100644
index 0000000..47799d9
--- /dev/null
+++ b/pll/parsePartition.h
@@ -0,0 +1,51 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file part.h
+ */
+#ifndef __pll_PART__
+#define __pll_PART__
+#include "queue.h"
+
+typedef struct
+{
+  int start;
+  int end;
+  int stride;
+} pllPartitionRegion;
+
+typedef struct 
+{
+  char * partitionName;
+  char * partitionModel;
+  int protModels;
+  int protUseEmpiricalFreqs;
+  int dataType;
+  int ascBias;
+  int optimizeBaseFrequencies;
+  pllQueue * regionList;
+} pllPartitionInfo;
+#endif
diff --git a/pll/parsimony.c b/pll/parsimony.c
new file mode 100644
index 0000000..1fae471
--- /dev/null
+++ b/pll/parsimony.c
@@ -0,0 +1,865 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file parsimony.c
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>  
+#endif
+
+#include <limits.h>
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+
+#if defined(__MIC_NATIVE)
+
+#include <immintrin.h>
+
+#define INTS_PER_VECTOR 16
+#define LONG_INTS_PER_VECTOR 8
+#define INT_TYPE __m512i
+#define CAST double*
+#define SET_ALL_BITS_ONE _mm512_set1_epi32(0xFFFFFFFF)
+#define SET_ALL_BITS_ZERO _mm512_setzero_epi32()
+#define VECTOR_LOAD _mm512_load_epi32
+#define VECTOR_STORE  _mm512_store_epi32
+#define VECTOR_BIT_AND _mm512_and_epi32
+#define VECTOR_BIT_OR  _mm512_or_epi32
+#define VECTOR_AND_NOT _mm512_andnot_epi32
+
+#elif defined(__AVX)
+
+#include <xmmintrin.h>
+#include <immintrin.h>
+#include <pmmintrin.h>
+
+#define ULINT_SIZE 64
+#define INTS_PER_VECTOR 8
+#define LONG_INTS_PER_VECTOR 4
+#define INT_TYPE __m256d
+#define CAST double*
+#define SET_ALL_BITS_ONE (__m256d)_mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
+#define SET_ALL_BITS_ZERO (__m256d)_mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)
+#define VECTOR_LOAD _mm256_load_pd
+#define VECTOR_BIT_AND _mm256_and_pd
+#define VECTOR_BIT_OR  _mm256_or_pd
+#define VECTOR_STORE  _mm256_store_pd
+#define VECTOR_AND_NOT _mm256_andnot_pd
+
+#elif (defined(__SSE3))
+
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+  
+#define INTS_PER_VECTOR 4
+#ifdef __i386__
+#define ULINT_SIZE 32
+#define LONG_INTS_PER_VECTOR 4
+#else
+#define ULINT_SIZE 64
+#define LONG_INTS_PER_VECTOR 2
+#endif
+#define INT_TYPE __m128i
+#define CAST __m128i*
+#define SET_ALL_BITS_ONE _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
+#define SET_ALL_BITS_ZERO _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000)
+#define VECTOR_LOAD _mm_load_si128
+#define VECTOR_BIT_AND _mm_and_si128
+#define VECTOR_BIT_OR  _mm_or_si128
+#define VECTOR_STORE  _mm_store_si128
+#define VECTOR_AND_NOT _mm_andnot_si128
+
+#endif
+
+#include "pll.h"
+#include "pllInternal.h"
+
+extern const unsigned int mask32[32]; 
+
+static __inline unsigned int vectorPopcount(INT_TYPE v)
+{
+  unsigned long
+    counts[LONG_INTS_PER_VECTOR] __attribute__ ((aligned (PLL_BYTE_ALIGNMENT)));
+
+  int    
+    i,
+    sum = 0;
+
+  VECTOR_STORE((CAST)counts, v);
+
+  for(i = 0; i < LONG_INTS_PER_VECTOR; i++)
+     sum += __builtin_popcountl(counts[i]);
+
+  return ((unsigned int)sum);
+}
+
+static __inline void storePerSiteScores (partitionList * pr, int model, INT_TYPE v, unsigned int offset)
+{
+  unsigned long
+    counts[LONG_INTS_PER_VECTOR] __attribute__ ((aligned (PLL_BYTE_ALIGNMENT)));
+  parsimonyNumber * buf;
+
+  int    
+    i,
+    j;
+  
+  VECTOR_STORE((CAST)counts, v);
+
+  for (i = 0; i < LONG_INTS_PER_VECTOR; ++i)
+   {
+     buf = &(pr->partitionData[model]->perSiteParsScores[offset * PLL_PCF + i * ULINT_SIZE]);
+     for (j = 0; j < ULINT_SIZE; ++ j)
+        buf[j] += ((counts[i] >> j) & 1);
+   }
+  
+}
+
+static void getxnodeLocal (nodeptr p)
+{
+  nodeptr  s;
+
+  if((s = p->next)->xPars || (s = s->next)->xPars)
+    {
+      p->xPars = s->xPars;
+      s->xPars = 0;
+    }
+
+  assert(p->next->xPars || p->next->next->xPars || p->xPars);
+
+}
+
+static void computeTraversalInfoParsimony(nodeptr p, int *ti, int *counter, int maxTips, pllBoolean full)
+{        
+  nodeptr 
+    q = p->next->back,
+    r = p->next->next->back;
+  
+  if(! p->xPars)
+    getxnodeLocal(p);  
+  
+  if(full)
+    {
+       if(q->number > maxTips) 
+         computeTraversalInfoParsimony(q, ti, counter, maxTips, full);
+      
+      if(r->number > maxTips) 
+        computeTraversalInfoParsimony(r, ti, counter, maxTips, full);
+    }
+  else
+    {
+      if(q->number > maxTips && !q->xPars) 
+        computeTraversalInfoParsimony(q, ti, counter, maxTips, full);
+      
+      if(r->number > maxTips && !r->xPars) 
+        computeTraversalInfoParsimony(r, ti, counter, maxTips, full);
+    }
+  
+  
+  ti[*counter]     = p->number;
+  ti[*counter + 1] = q->number;
+  ti[*counter + 2] = r->number;
+  *counter = *counter + 4;
+}
+
+/* check whether site contains at least 2 different letters, i.e.
+   whether it will generate a score */
+static pllBoolean isInformative(pllInstance *tr, int dataType, int site)
+{
+  int
+    informativeCounter = 0,
+    check[256],   
+    j,   
+    undetermined = getUndetermined(dataType);
+
+  const unsigned int
+    *bitVector = getBitVector(dataType);
+
+  unsigned char
+    nucleotide;
+  
+        
+  for(j = 0; j < 256; j++)
+    check[j] = 0;
+  
+  for(j = 1; j <= tr->mxtips; j++)
+    {      
+      nucleotide = tr->yVector[j][site];            
+      check[nucleotide] = 1;
+      assert(bitVector[nucleotide] > 0);                   
+    }
+  
+  for(j = 0; j < undetermined; j++)
+    {
+      if(check[j] > 0)
+        informativeCounter++;    
+    } 
+          
+  if(informativeCounter > 1)
+    return PLL_TRUE;    
+
+  return PLL_FALSE;          
+}
+
+static void compressDNA(pllInstance *tr, partitionList *pr, int *informative, int perSiteScores)
+{
+  size_t
+    totalNodes,
+    i,
+    model;
+   
+  totalNodes = 2 * (size_t)tr->mxtips;
+
+ 
+
+  for(model = 0; model < (size_t) pr->numberOfPartitions; model++)
+    {
+      size_t
+        k,
+        states = (size_t)pr->partitionData[model]->states,
+        compressedEntries,
+        compressedEntriesPadded,
+        entries = 0, 
+        lower = pr->partitionData[model]->lower,
+        upper = pr->partitionData[model]->upper;
+
+      parsimonyNumber 
+        **compressedTips = (parsimonyNumber **)rax_malloc(states * sizeof(parsimonyNumber*)),
+        *compressedValues = (parsimonyNumber *)rax_malloc(states * sizeof(parsimonyNumber));
+      
+      for(i = lower; i < upper; i++)    
+        if(informative[i])
+          entries += (size_t)tr->aliaswgt[i];     
+  
+      compressedEntries = entries / PLL_PCF;
+
+      if(entries % PLL_PCF != 0)
+        compressedEntries++;
+
+#if (defined(__SSE3) || defined(__AVX))
+      if(compressedEntries % INTS_PER_VECTOR != 0)
+        compressedEntriesPadded = compressedEntries + (INTS_PER_VECTOR - (compressedEntries % INTS_PER_VECTOR));
+      else
+        compressedEntriesPadded = compressedEntries;
+#else
+      compressedEntriesPadded = compressedEntries;
+#endif     
+
+      
+      rax_posix_memalign ((void **) &(pr->partitionData[model]->parsVect), PLL_BYTE_ALIGNMENT, (size_t)compressedEntriesPadded * states * totalNodes * sizeof(parsimonyNumber));
+      if (perSiteScores)
+       {
+         rax_posix_memalign ((void **) &(pr->partitionData[model]->perSiteParsScores), PLL_BYTE_ALIGNMENT, (size_t)pr->partitionData[model]->width* sizeof (parsimonyNumber));
+         for (i = 0; i < (size_t)pr->partitionData[model]->width; ++i) pr->partitionData[model]->perSiteParsScores[i] = 0;
+       }
+
+     
+      for(i = 0; i < compressedEntriesPadded * states * totalNodes; i++)      
+        pr->partitionData[model]->parsVect[i] = 0;
+
+      for(i = 0; i < (size_t)tr->mxtips; i++)
+        {
+          size_t
+            w = 0,
+            compressedIndex = 0,
+            compressedCounter = 0,
+            index = 0;
+
+          for(k = 0; k < states; k++)
+            {
+              compressedTips[k] = &(pr->partitionData[model]->parsVect[(compressedEntriesPadded * states * (i + 1)) + (compressedEntriesPadded * k)]);
+              compressedValues[k] = 0;
+            }                
+              
+          for(index = lower; index < (size_t)upper; index++)
+            {
+              if(informative[index])
+                {
+                  const unsigned int 
+                    *bitValue = getBitVector(pr->partitionData[model]->dataType);
+
+                  parsimonyNumber 
+                    value = bitValue[tr->yVector[i + 1][index]];          
+              
+                  for(w = 0; w < (size_t)tr->aliaswgt[index]; w++)
+                    {      
+                      for(k = 0; k < states; k++)
+                        {
+                          if(value & mask32[k])
+                            compressedValues[k] |= mask32[compressedCounter];
+                        }
+                     
+                      compressedCounter++;
+                  
+                      if(compressedCounter == PLL_PCF)
+                        {
+                          for(k = 0; k < states; k++)
+                            {
+                              compressedTips[k][compressedIndex] = compressedValues[k];
+                              compressedValues[k] = 0;
+                            }                    
+                          
+                          compressedCounter = 0;
+                          compressedIndex++;
+                        }
+                    }
+                }
+            }
+          
+          for(;compressedIndex < compressedEntriesPadded; compressedIndex++)
+            {   
+              for(;compressedCounter < PLL_PCF; compressedCounter++)              
+                for(k = 0; k < states; k++)
+                  compressedValues[k] |= mask32[compressedCounter];               
+          
+              for(k = 0; k < states; k++)
+                {
+                  compressedTips[k][compressedIndex] = compressedValues[k];
+                  compressedValues[k] = 0;
+                }                     
+              
+              compressedCounter = 0;
+            }           
+        }
+  
+      pr->partitionData[model]->parsimonyLength = compressedEntriesPadded;
+
+      rax_free(compressedTips);
+      rax_free(compressedValues);
+    }
+  
+  rax_posix_memalign ((void **) &(tr->parsimonyScore), PLL_BYTE_ALIGNMENT, sizeof(unsigned int) * totalNodes);  
+          
+  for(i = 0; i < totalNodes; i++) 
+    tr->parsimonyScore[i] = 0;
+}
+
+static void determineUninformativeSites(pllInstance *tr, partitionList *pr, int *informative)
+{
+  int 
+    model,
+    number = 0,
+    i;
+
+  /* 
+     Not all characters are useful in constructing a parsimony tree. 
+     Invariant characters, those that have the same state in all taxa, 
+     are obviously useless and are ignored by the method. Characters in 
+     which a state occurs in only one taxon are also ignored. 
+     All these characters are called parsimony uninformative.
+
+     Alternative definition: informative columns contain at least two types
+     of nucleotides, and each nucleotide must appear at least twice in each 
+     column. Kind of a pain if we intend to check for this when using, e.g.,
+     ambiguous DNA encoding.
+  */
+
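+  /* For example (illustrative), with four taxa the column A A A A is invariant
+     and A A A T contains T in a single taxon only, so both are parsimony
+     uninformative; a column such as A A T T can favour one topology over
+     another and is therefore informative under the stricter definition above. */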
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      for(i = pr->partitionData[model]->lower; i < pr->partitionData[model]->upper; i++)
+        {
+           if(isInformative(tr, pr->partitionData[model]->dataType, i))
+             informative[i] = 1;
+           else
+             {
+               informative[i] = 0;
+               number++;
+             }  
+        }      
+    }
+
+  /* printf("Uninformative Patterns: %d\n", number); */
+}
+
+void pllInitParsimonyStructures(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores)
+{
+  int 
+    i,
+    *informative = (int *)rax_malloc(sizeof(int) * (size_t)tr->originalCrunchedLength);
+
+  for (i = 0; i < pr->numberOfPartitions; ++ i)
+     rax_free (pr->partitionData[i]->parsVect);
+
+  rax_free (tr->parsimonyScore);
+ 
+  determineUninformativeSites(tr, pr, informative);
+
+  compressDNA(tr, pr, informative, perSiteScores);
+
+  for(i = tr->mxtips + 1; i <= tr->mxtips + tr->mxtips - 1; i++)
+    {
+      nodeptr 
+        p = tr->nodep[i];
+
+      p->xPars             = 1;
+      p->next->xPars       = 0;
+      p->next->next->xPars = 0;
+    }
+
+  tr->ti = (int*)rax_malloc(sizeof(int) * 4 * (size_t)tr->mxtips);  
+
+  rax_free(informative); 
+}
+
+static void newviewParsimonyIterativeFast(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores)
+{    
+  INT_TYPE
+    allOne = SET_ALL_BITS_ONE;
+
+  int 
+    model,
+    *ti = tr->ti,
+    count = ti[0],
+    index; 
+
+  for(index = 4; index < count; index += 4)
+    {      
+      unsigned int
+        totalScore = 0;
+
+      size_t
+        pNumber = (size_t)ti[index],
+        qNumber = (size_t)ti[index + 1],
+        rNumber = (size_t)ti[index + 2];
+      
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        {
+          size_t
+            k,
+            states = pr->partitionData[model]->states,
+            width = pr->partitionData[model]->parsimonyLength;
+            
+          unsigned int  
+            i;      
+                 
+          switch(states)
+            {
+            case 2:       
+              {
+                parsimonyNumber
+                  *left[2],
+                  *right[2],
+                  *this[2];
+
+                for(k = 0; k < 2; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    INT_TYPE
+                      s_r, s_l, v_N,
+                      l_A, l_C,
+                      v_A, v_C;          
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[0][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[0][i]));
+                    l_A = VECTOR_BIT_AND(s_l, s_r);
+                    v_A = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[1][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[1][i]));
+                    l_C = VECTOR_BIT_AND(s_l, s_r);
+                    v_C = VECTOR_BIT_OR(s_l, s_r);                                                                
+                    
+                    v_N = VECTOR_BIT_OR(l_A, l_C);
+                    
+                    VECTOR_STORE((CAST)(&this[0][i]), VECTOR_BIT_OR(l_A, VECTOR_AND_NOT(v_N, v_A)));
+                    VECTOR_STORE((CAST)(&this[1][i]), VECTOR_BIT_OR(l_C, VECTOR_AND_NOT(v_N, v_C)));                                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);            
+                    if (perSiteScores)
+                       storePerSiteScores (pr, model, v_N, i);
+                  }
+              }
+              break;
+            case 4:
+              {
+                parsimonyNumber
+                  *left[4],
+                  *right[4],
+                  *this[4];
+
+                for(k = 0; k < 4; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
+                  }
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    INT_TYPE
+                      s_r, s_l, v_N,
+                      l_A, l_C, l_G, l_T,
+                      v_A, v_C, v_G, v_T;                
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[0][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[0][i]));
+                    l_A = VECTOR_BIT_AND(s_l, s_r);
+                    v_A = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[1][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[1][i]));
+                    l_C = VECTOR_BIT_AND(s_l, s_r);
+                    v_C = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[2][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[2][i]));
+                    l_G = VECTOR_BIT_AND(s_l, s_r);
+                    v_G = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[3][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[3][i]));
+                    l_T = VECTOR_BIT_AND(s_l, s_r);
+                    v_T = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    v_N = VECTOR_BIT_OR(VECTOR_BIT_OR(l_A, l_C), VECTOR_BIT_OR(l_G, l_T));                                
+                    
+                    VECTOR_STORE((CAST)(&this[0][i]), VECTOR_BIT_OR(l_A, VECTOR_AND_NOT(v_N, v_A)));
+                    VECTOR_STORE((CAST)(&this[1][i]), VECTOR_BIT_OR(l_C, VECTOR_AND_NOT(v_N, v_C)));
+                    VECTOR_STORE((CAST)(&this[2][i]), VECTOR_BIT_OR(l_G, VECTOR_AND_NOT(v_N, v_G)));
+                    VECTOR_STORE((CAST)(&this[3][i]), VECTOR_BIT_OR(l_T, VECTOR_AND_NOT(v_N, v_T)));                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);  
+                    
+                    if (perSiteScores)
+                       storePerSiteScores (pr, model, v_N, i);
+                  }
+              }
+              break;
+            case 20:
+              {
+                parsimonyNumber
+                  *left[20],
+                  *right[20],
+                  *this[20];
+
+                for(k = 0; k < 20; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    size_t j;
+                    
+                    INT_TYPE
+                      s_r, s_l, 
+                      v_N = SET_ALL_BITS_ZERO,
+                      l_A[20], 
+                      v_A[20];           
+                    
+                    for(j = 0; j < 20; j++)
+                      {
+                        s_l = VECTOR_LOAD((CAST)(&left[j][i]));
+                        s_r = VECTOR_LOAD((CAST)(&right[j][i]));
+                        l_A[j] = VECTOR_BIT_AND(s_l, s_r);
+                        v_A[j] = VECTOR_BIT_OR(s_l, s_r);
+                        
+                        v_N = VECTOR_BIT_OR(v_N, l_A[j]);
+                      }
+                    
+                    for(j = 0; j < 20; j++)                 
+                      VECTOR_STORE((CAST)(&this[j][i]), VECTOR_BIT_OR(l_A[j], VECTOR_AND_NOT(v_N, v_A[j])));                                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);
+
+                    if (perSiteScores)
+                       storePerSiteScores (pr, model, v_N, i);
+                  }
+              }
+              break;
+            default:
+              {
+                parsimonyNumber
+                  *left[32], 
+                  *right[32],
+                  *this[32];
+
+                assert(states <= 32);
+                
+                for(k = 0; k < states; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * states * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    size_t j;
+                    
+                    INT_TYPE
+                      s_r, s_l, 
+                      v_N = SET_ALL_BITS_ZERO,
+                      l_A[32], 
+                      v_A[32];           
+                    
+                    for(j = 0; j < states; j++)
+                      {
+                        s_l = VECTOR_LOAD((CAST)(&left[j][i]));
+                        s_r = VECTOR_LOAD((CAST)(&right[j][i]));
+                        l_A[j] = VECTOR_BIT_AND(s_l, s_r);
+                        v_A[j] = VECTOR_BIT_OR(s_l, s_r);
+                        
+                        v_N = VECTOR_BIT_OR(v_N, l_A[j]);
+                      }
+                    
+                    for(j = 0; j < states; j++)             
+                      VECTOR_STORE((CAST)(&this[j][i]), VECTOR_BIT_OR(l_A[j], VECTOR_AND_NOT(v_N, v_A[j])));                                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);
+
+                    if (perSiteScores)
+                       storePerSiteScores (pr, model, v_N, i);
+                  }                             
+              }
+            }            
+        }
+      tr->parsimonyScore[pNumber] = totalScore + tr->parsimonyScore[rNumber] + tr->parsimonyScore[qNumber];      
+    }
+}
+
+static unsigned int evaluateParsimonyIterativeFast(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores)
+{
+  INT_TYPE 
+    allOne = SET_ALL_BITS_ONE;
+
+  size_t 
+    pNumber = (size_t)tr->ti[1],
+    qNumber = (size_t)tr->ti[2];
+
+  int
+    model;
+
+  unsigned int 
+    bestScore = tr->bestParsimony,    
+    sum;
+
+  if(tr->ti[0] > 4)
+    newviewParsimonyIterativeFast(tr, pr, perSiteScores);
+
+  sum = tr->parsimonyScore[pNumber] + tr->parsimonyScore[qNumber];
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      size_t
+        k,
+        states = pr->partitionData[model]->states,
+        width  = pr->partitionData[model]->parsimonyLength,
+        i;
+
+       switch(states)
+         {
+         case 2:
+           {
+             parsimonyNumber
+               *left[2],
+               *right[2];
+             
+             for(k = 0; k < 2; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
+               }     
+             
+             for(i = 0; i < width; i += INTS_PER_VECTOR)
+               {                                               
+                 INT_TYPE      
+                   l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[0][i])), VECTOR_LOAD((CAST)(&right[0][i]))),
+                   l_C = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[1][i])), VECTOR_LOAD((CAST)(&right[1][i]))),            
+                   v_N = VECTOR_BIT_OR(l_A, l_C);
+                 
+                 v_N = VECTOR_AND_NOT(v_N, allOne);
+                 
+                 sum += vectorPopcount(v_N);
+                  if (perSiteScores)
+                    storePerSiteScores (pr, model, v_N, i);
+               }
+           }
+           break;
+         case 4:
+           {
+             parsimonyNumber
+               *left[4],
+               *right[4];
+      
+             for(k = 0; k < 4; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
+               }        
+
+             for(i = 0; i < width; i += INTS_PER_VECTOR)
+               {                                                
+                 INT_TYPE      
+                   l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[0][i])), VECTOR_LOAD((CAST)(&right[0][i]))),
+                   l_C = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[1][i])), VECTOR_LOAD((CAST)(&right[1][i]))),
+                   l_G = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[2][i])), VECTOR_LOAD((CAST)(&right[2][i]))),
+                   l_T = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[3][i])), VECTOR_LOAD((CAST)(&right[3][i]))),
+                   v_N = VECTOR_BIT_OR(VECTOR_BIT_OR(l_A, l_C), VECTOR_BIT_OR(l_G, l_T));     
+                 
+                 v_N = VECTOR_AND_NOT(v_N, allOne);
+                 
+                 sum += vectorPopcount(v_N);
+                  if (perSiteScores)
+                    storePerSiteScores (pr, model, v_N, i);
+               }                 
+           }
+           break;
+         case 20:
+           {
+             parsimonyNumber
+               *left[20],
+               *right[20];
+             
+              for(k = 0; k < 20; k++)
+                {
+                  left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
+                  right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
+                }  
+           
+              for(i = 0; i < width; i += INTS_PER_VECTOR)
+                {                              
+                  int 
+                    j;
+                  
+                  INT_TYPE      
+                    l_A,
+                    v_N = SET_ALL_BITS_ZERO;     
+                  
+                  for(j = 0; j < 20; j++)
+                    {
+                      l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[j][i])), VECTOR_LOAD((CAST)(&right[j][i])));
+                      v_N = VECTOR_BIT_OR(l_A, v_N);
+                    }
+                  
+                  v_N = VECTOR_AND_NOT(v_N, allOne);
+                  
+                  sum += vectorPopcount(v_N);          
+                  if (perSiteScores)
+                    storePerSiteScores (pr, model, v_N, i);
+                }
+           }
+           break;
+         default:
+           {
+             parsimonyNumber
+               *left[32],  
+               *right[32]; 
+
+             assert(states <= 32);
+
+             for(k = 0; k < states; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
+               }  
+           
+             for(i = 0; i < width; i += INTS_PER_VECTOR)
+               {                               
+                 size_t
+                   j;
+                 
+                 INT_TYPE      
+                   l_A,
+                   v_N = SET_ALL_BITS_ZERO;     
+                 
+                 for(j = 0; j < states; j++)
+                   {
+                     l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[j][i])), VECTOR_LOAD((CAST)(&right[j][i])));
+                     v_N = VECTOR_BIT_OR(l_A, v_N);
+                   }
+                 
+                 v_N = VECTOR_AND_NOT(v_N, allOne);
+                 
+                 sum += vectorPopcount(v_N);           
+                 if (perSiteScores)
+                   storePerSiteScores (pr, model, v_N, i);
+               }
+           }
+         }
+    }
+  
+  return sum;
+}
+
+unsigned int pllEvaluateParsimony(pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean full, pllBoolean perSiteScores)
+{
+  volatile unsigned int result;
+  nodeptr q = p->back;
+  int
+    *ti = tr->ti,
+    counter = 4;
+  
+  ti[1] = p->number;
+  ti[2] = q->number;
+
+  if(full)
+    {
+      if(p->number > tr->mxtips)
+        computeTraversalInfoParsimony(p, ti, &counter, tr->mxtips, full);
+      if(q->number > tr->mxtips)
+        computeTraversalInfoParsimony(q, ti, &counter, tr->mxtips, full); 
+    }
+  else
+    {
+      if(p->number > tr->mxtips && !p->xPars)
+        computeTraversalInfoParsimony(p, ti, &counter, tr->mxtips, full);
+      if(q->number > tr->mxtips && !q->xPars)
+        computeTraversalInfoParsimony(q, ti, &counter, tr->mxtips, full); 
+    }
+
+  ti[0] = counter;
+
+  result = evaluateParsimonyIterativeFast(tr, pr, perSiteScores);
+
+  return result;
+}
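+
+/* A minimal usage sketch (illustrative; assumes a pllInstance `tr` and a
+   partitionList `pr` with tip sequences already set up elsewhere):
+
+     pllInitParsimonyStructures (tr, pr, PLL_FALSE);   // no per-site scores
+     unsigned int score = pllEvaluateParsimony (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+     printf ("parsimony score: %u\n", score);
+*/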
diff --git a/pll/pll.h b/pll/pll.h
new file mode 100644
index 0000000..e450d62
--- /dev/null
+++ b/pll/pll.h
@@ -0,0 +1,1692 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ *
+ * ABSTRACT
+ * 
+ * PLL is a highly optimized, parallelized software library to ease the
+ * development of new software tools dealing with phylogenetic inference. Among
+ * the functions included in PLL are 
+ *
+ * DOCUMENTATION
+ *
+ * Extensive documentation for using PLL is available online at
+ * 
+ *                 http://www.libpll.org
+ *
+ *
+ * USAGE
+ *
+ * To use PLL, 
+ *
+ * @file pll.h
+ * @brief Data structures for tree and model 
+ *
+ * @author Tomas Flouri
+ * @author Fernando Izquierdo-Carrasco
+ * @author Andre Aberer
+ * @author Alexandros Stamatakis
+ */
+
+#ifndef __pll__
+#define __pll__
+
+#include <stdint.h>
+#include <stdio.h>
+#include <errno.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __MIC_NATIVE
+#define PLL_BYTE_ALIGNMENT 64
+#define PLL_VECTOR_WIDTH 8
+#elif defined (__AVX)
+
+#include <xmmintrin.h>
+#include <immintrin.h>
+#include <pmmintrin.h>
+
+#define PLL_BYTE_ALIGNMENT 32
+#define PLL_VECTOR_WIDTH 4
+
+#elif defined (__SSE3)
+
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+
+#define PLL_BYTE_ALIGNMENT 16
+#define PLL_VECTOR_WIDTH 2
+
+#else
+#define PLL_BYTE_ALIGNMENT 1
+#define PLL_VECTOR_WIDTH 1
+#endif
+
+#ifdef _MSC_VER
+	#define PLL_ALIGN_BEGIN __declspec(align(PLL_BYTE_ALIGNMENT))
+	#define PLL_ALIGN_END
+#else
+	#define PLL_ALIGN_BEGIN
+	#define PLL_ALIGN_END __attribute__((aligned(PLL_BYTE_ALIGNMENT)))
+#endif
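+
+/* Illustrative use of the alignment helpers above (the variable name is just an
+   example): declare a buffer aligned to the vector width selected at compile time.
+
+     PLL_ALIGN_BEGIN double scratch[PLL_VECTOR_WIDTH] PLL_ALIGN_END;
+*/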
+
+
+#include "stack.h"
+#include "newick.h"
+#include "queue.h"
+
+#define PLL_MAX_TIP_EV                          0.999999999 /* max tip vector value, sum of EVs needs to be smaller than 1.0, otherwise the numerics break down */
+#define PLL_MAX_LOCAL_SMOOTHING_ITERATIONS      32          /** @brief maximum iterations of branch length smoothings per insert */
+#define PLL_ITERATIONS                          10          /* maximum number of iterations per insert */
+#define PLL_NEWZPERCYCLE                        10           /* iterations of makenewz per tree traversal */
+#define PLL_NMLNGTH                             256         /* number of characters in species name */
+#define PLL_DELTAZ                              0.00001     /* test of net branch length change in update */
+#define PLL_DEFAULTZ                            0.9         /* value of z assigned as starting point */
+#define PLL_UNLIKELY                            -1.0E300    /* low likelihood for initialization */
+#define PLL_SUMMARIZE_LENGTH                    -3
+#define PLL_SUMMARIZE_LH                        -2
+#define PLL_NO_BRANCHES                         -1
+#define PLL_MASK_LENGTH                         32
+#define PLL_ZMIN                                1.0E-15  /* max branch prop. to -log(PLL_ZMIN) (= 34) */
+#define PLL_ZMAX                                (1.0 - 1.0E-6) /* min branch prop. to 1.0-zmax (= 1.0E-6) */
+#define PLL_TWOTOTHE256                         115792089237316195423570985008687907853269984665640564039457584007913129639936.0  /*  2**256 (exactly)  */
+#define PLL_MINLIKELIHOOD                       (1.0/PLL_TWOTOTHE256)
+#define PLL_MINUSMINLIKELIHOOD                  -PLL_MINLIKELIHOOD
+
+
+#define PLL_FORMAT_PHYLIP                       1 
+#define PLL_FORMAT_FASTA                        2
+#define PLL_FORMAT_NEWICK                       3
+
+#define PLL_NNI_P_NEXT                          1       /**< Use p->next for the NNI move */
+#define PLL_NNI_P_NEXTNEXT                      2       /**< Use p->next->next for the NNI move */
+
+#define PLL_BADREAR                             -1
+
+#define PLL_NUM_BRANCHES                        16
+
+#define PLL_TRUE                                1
+#define PLL_FALSE                               0
+
+#define PLL_REARRANGE_SPR                       0
+#define PLL_REARRANGE_TBR                       1
+#define PLL_REARRANGE_NNI                       2
+
+#define PLL_AA_SCALE                            10.0
+#define PLL_AA_SCALE_PLUS_EPSILON               10.001
+
+/* ALPHA_MIN is critical -> numerical instability, eg for 4 discrete rate cats                    */
+/* and alpha = 0.01 the lowest rate r_0 is                                                        */
+/* 0.00000000000000000000000000000000000000000000000000000000000034878079110511010487             */
+/* which leads to numerical problems Table for alpha settings below:                              */
+/*                                                                                                */
+/* 0.010000 0.00000000000000000000000000000000000000000000000000000000000034878079110511010487    */
+/* 0.010000 yielded nasty numerical bugs in at least one case !                                   */
+/* 0.020000 0.00000000000000000000000000000044136090435925743185910935350715027016962154188875    */
+/* 0.030000 0.00000000000000000000476844846859006690412039180149775802624789852441798419292220    */
+/* 0.040000 0.00000000000000049522423236954066431210260930029681736928018820007024736185030633    */
+/* 0.050000 0.00000000000050625351310359203371872643495343928538368616365517027588794007897377    */
+/* 0.060000 0.00000000005134625283884191118711474021861409372524676086868566926568746566772461    */
+/* 0.070000 0.00000000139080650074206434685544624965062437960128249869740102440118789672851562    */
+/* 0.080000 0.00000001650681201563587066858709818343436959153791576682124286890029907226562500    */
+/* 0.090000 0.00000011301977332931251259273962858978301859735893231118097901344299316406250000    */
+/* 0.100000 0.00000052651925834844387815526344648331402709118265192955732345581054687500000000    */
+
+#define PLL_ALPHA_MIN                           0.02
+#define PLL_ALPHA_MAX                           1000.0
+
+#define PLL_RATE_MIN                            0.0000001
+#define PLL_RATE_MAX                            1000000.0
+
+#define PLL_LG4X_RATE_MIN                       0.0000001
+#define PLL_LG4X_RATE_MAX                       1000.0
+
+#define PLL_FREQ_MIN                            0.001
+
+#define PLL_NUM_AA_STATES                       20
+#define PLL_NUM_DNA_STATES                      4
+
+/* 
+   previous values between 0.001 and 0.000001
+
+   TO AVOID NUMERICAL PROBLEMS WHEN FREQ == 0 IN PARTITIONED MODELS, ESPECIALLY WITH AA 
+   previous value of FREQ_MIN was: 0.000001, but this seemed to cause problems with some 
+   of the 7-state secondary structure models with some rather exotic small toy test datasets,
+   on the other hand 0.001 caused problems with some of the 16-state secondary structure models
+
+   For some reason the frequency settings seem to be repeatedly causing numerical problems
+*/
+
+#define PLL_ITMAX                               100    /* max number of iterations in brent's algorithm */
+
+#define PLL_SHFT(a,b,c,d)                       (a)=(b);(b)=(c);(c)=(d);
+#define PLL_SIGN(a,b)                           ((b) > 0.0 ? fabs(a) : -fabs(a))
+#define PLL_ABS(x)                              (((x)<0)   ?  (-(x)) : (x))
+#define PLL_MIN(x,y)                            (((x)<(y)) ?    (x)  : (y))
+#define PLL_MAX(x,y)                            (((x)>(y)) ?    (x)  : (y))
+#define PLL_SWAP(x,y)                           do{ __typeof__ (x) _t = x; x = y; y = _t; } while(0)
+#define PLL_SWAP_PTR(x,y) do{ char* _t = x; x = y; y = _t; } while(0)
+#define PLL_SWAP_INT(x,y) do{ int _t = x; x = y; y = _t; } while(0)
+
+#define PLL_POINT_GAMMA(prob,alpha,beta)        PointChi2(prob,2.0*(alpha))/(2.0*(beta))
+
+#define PLL_LIB_NAME                            "PLL"
+#define PLL_LIB_VERSION                         "1.0.1"
+#define PLL_LIB_DATE                            "November 3 2014"
+
+/* aminoacid substitution models */
+#define PLL_DAYHOFF                             0
+#define PLL_DCMUT                               1
+#define PLL_JTT                                 2
+#define PLL_MTREV                               3
+#define PLL_WAG                                 4
+#define PLL_RTREV                               5
+#define PLL_CPREV                               6
+#define PLL_VT                                  7
+#define PLL_BLOSUM62                            8
+#define PLL_MTMAM                               9
+#define PLL_LG                                  10
+#define PLL_MTART                               11
+#define PLL_MTZOA                               12
+#define PLL_PMB                                 13
+#define PLL_HIVB                                14
+#define PLL_HIVW                                15
+#define PLL_JTTDCMUT                            16
+#define PLL_FLU                                 17 
+#define PLL_AUTO                                18
+#define PLL_LG4M                                19
+#define PLL_LG4X                                20
+#define PLL_GTR                                 21  /* GTR always needs to be the last one */
+#define PLL_NUM_PROT_MODELS                     22
+
+/* information criteria for auto protein model selection */
+#define PLL_AUTO_ML   0
+#define PLL_AUTO_BIC  1
+#define PLL_AUTO_AIC  2
+#define PLL_AUTO_AICC 3
+
+/* bipartition stuff */
+#define PLL_BIPARTITIONS_RF                     4
+
+/* scenarios for likelihood computation */
+#define PLL_TIP_TIP                             0
+#define PLL_TIP_INNER                           1
+#define PLL_INNER_INNER                         2
+
+
+/* available data types in PLL */
+#define PLL_MIN_MODEL                          -1
+#define PLL_BINARY_DATA                         0
+#define PLL_DNA_DATA                            1
+#define PLL_AA_DATA                             2
+#define PLL_SECONDARY_DATA                      3
+#define PLL_SECONDARY_DATA_6                    4
+#define PLL_SECONDARY_DATA_7                    5
+#define PLL_GENERIC_32                          6
+#define PLL_GENERIC_64                          7
+#define PLL_MAX_MODEL                           8
+
+#define PLL_SEC_6_A                             0
+#define PLL_SEC_6_B                             1
+#define PLL_SEC_6_C                             2
+#define PLL_SEC_6_D                             3
+#define PLL_SEC_6_E                             4
+
+#define PLL_SEC_7_A                             5
+#define PLL_SEC_7_B                             6
+#define PLL_SEC_7_C                             7
+#define PLL_SEC_7_D                             8
+#define PLL_SEC_7_E                             9
+#define PLL_SEC_7_F                             10
+
+#define PLL_SEC_16                              11
+#define PLL_SEC_16_A                            12
+#define PLL_SEC_16_B                            13
+#define PLL_SEC_16_C                            14
+#define PLL_SEC_16_D                            15
+#define PLL_SEC_16_E                            16
+#define PLL_SEC_16_F                            17
+#define PLL_SEC_16_I                            18
+#define PLL_SEC_16_J                            19
+#define PLL_SEC_16_K                            20
+
+#define PLL_ORDERED_MULTI_STATE                 0
+#define PLL_MK_MULTI_STATE                      1
+#define PLL_GTR_MULTI_STATE                     2
+
+
+/* available models of rate heterogeneity in PLL */
+#define PLL_CAT                                 0
+#define PLL_GAMMA                               1
+
+/* recomp */
+#define PLL_SLOT_UNUSED                        -2  /* value to mark an available vector */
+#define PLL_NODE_UNPINNED                      -3  /* marks an inner node as not available in RAM */
+#define PLL_INNER_NODE_INIT_STLEN              -1  /* initialization */
+
+#define PLL_MIN_RECOM_FRACTION     0.1 /* at least this % of inner nodes will be allocated in RAM */
+#define PLL_MAX_RECOM_FRACTION     1.0 /* always 1, just there for boundary checks */
+
+
+typedef  int pllBoolean;
+
+/* @brief PLL instance attribute structure */
+typedef struct
+{
+  int rateHetModel;
+  int fastScaling;
+  int saveMemory;
+  int useRecom;
+  long randomNumberSeed;
+  int numberOfThreads;
+} pllInstanceAttr;
+
+/** @brief Stores the recomputation-state of likelihood vectors  */
+typedef struct
+{
+  int numVectors;      /**< Number of inner vectors allocated in RAM*/
+  int *iVector;        /**< size: numVectors, stores node id || PLL_SLOT_UNUSED  */
+  int *iNode;          /**< size: inner nodes, stores slot id || PLL_NODE_UNPINNED */
+  int *stlen;          /**< Number of tips behind the current orientation of the indexed inner node (subtree size/cost) */ 
+  int *unpinnable;     /**< size: numVectors, PLL_TRUE if the vector is no longer needed and can be unpinned */
+  int maxVectorsUsed;  
+  pllBoolean allSlotsBusy; /**< on if all slots contain an ancestral node (the usual case after the first full traversal) */ 
+} recompVectors;
+/* E recomp */
+
+/** @brief Unsigned integer type used to store hash values
+ * @todo add explanation, is this ever used?  */
+ 
+typedef unsigned int hashNumberType;
+
+
+
+/*typedef uint_fast32_t parsimonyNumber;*/
+
+#define PLL_PCF 32
+
+/** @brief Entry of the bipartition hash table
+ * @todo add explanation of all hash tables  */
+typedef struct pllBipartitionEntry
+{
+  unsigned int *bitVector;
+  unsigned int *treeVector;
+  unsigned int amountTips;
+  int *supportVector;
+  unsigned int bipNumber;
+  unsigned int bipNumber2;
+  unsigned int supportFromTreeset[2]; 
+  struct pllBipartitionEntry *next;
+} pllBipartitionEntry;
+
+//typedef struct
+//{
+//  hashNumberType tableSize;
+//  entry **table;
+//  hashNumberType entryCount;
+//}
+//  hashtable;
+//struct stringEnt
+//{
+//  int nodeNumber;
+//  char *word;
+//  struct stringEnt *next;
+//};
+//
+//typedef struct stringEnt stringEntry;
+//typedef struct
+//{
+//  hashNumberType tableSize;
+//  stringEntry **table;
+//}
+//  stringHashtable;
+
+typedef struct pllHashItem
+{
+  void * data;
+  char * str;
+  struct pllHashItem * next;
+} pllHashItem;
+
+typedef struct pllHashTable
+{
+  unsigned int size;
+  struct pllHashItem ** Items;
+  unsigned int entries;
+} pllHashTable;
+
+
+
+
+/** @brief Per-site rate category entry: accumulated per-site likelihood and the CAT rate applied to it
+  *
+  */
+typedef struct ratec
+{
+  double accumulatedSiteLikelihood;
+  double rate;
+}rateCategorize;
+
+/** @brief Traversal descriptor entry.
+  * 
+  * Contains the information required to execute an operation in a step of the tree traversal.
+  * q   r
+  *  \ /
+  *   p
+  *
+  * The entry defines two input/parent nodes (q and r) and one output/child node (p).
+  * qz represents the branch length(s) of the branch connecting q and p,
+  * rz represents the branch length(s) of the branch connecting r and p.
+  * PLL_TIP_TIP     Both q and r are tips
+  * PLL_INNER_INNER Both q and r are inner nodes
+  * @note PLL_TIP_INNER   q is a tip and r is an inner node (by convention, flip q and r if required)
+  */
+typedef struct
+{
+  int tipCase;                  /**< Type of entry, must be PLL_TIP_TIP PLL_TIP_INNER or PLL_INNER_INNER */
+  int pNumber;                  /**< should exist in some nodeptr p->number */
+  int qNumber;                  /**< should exist in some nodeptr q->number */
+  int rNumber;                  /**< should exist in some nodeptr r->number */
+  double qz[PLL_NUM_BRANCHES];
+  double rz[PLL_NUM_BRANCHES];
+  /* recom */
+  int slot_p;                   /**< In recomputation mode, the RAM slot index for likelihood vector of node p, otherwise unused */
+  int slot_q;                   /**< In recomputation mode, the RAM slot index for likelihood vector of node q, otherwise unused */
+  int slot_r;                   /**< In recomputation mode, the RAM slot index for likelihood vector of node r, otherwise unused */
+  /* E recom */
+} traversalInfo;
+
+/** @brief Traversal descriptor.
+  * 
+  * Describes the state of a traversal descriptor
+  */
+typedef struct
+{
+  traversalInfo *ti;              /**< list of traversal steps */
+  int count;                      /**< number of traversal steps */
+  int functionType;
+  pllBoolean traversalHasChanged;   
+  pllBoolean *executeModel;           
+  double  *parameterValues;
+} traversalData;
+
+/** @brief Node record structure
+  * 
+  * Each inner node is a trifurcation in the tree, represented as a circular list containing 3 node records. One node record uniquely identifies a subtree and the orientation of the likelihood vector within the node.
+  *
+  * p1 -------> p2 ----> to the next node
+  * ^           |
+  * |-----p3<---|          
+  * 
+  */
+struct noderec;
+
+/** @brief Branch length information.
+  * 
+  * @todo add relevant info on where this is used ???
+  */
+typedef struct
+{
+  unsigned int *vector; 
+  int support;   
+  struct noderec *oP;
+  struct noderec *oQ;
+} branchInfo;
+
+
+
+
+
+/** @brief Linkage of partitions.
+  * 
+  * @todo add relevant info on where this is used ???
+  */
+typedef struct
+{
+  pllBoolean valid;
+  int partitions;  
+  int *partitionList;
+}
+  linkageData;
+typedef struct
+{
+  int entries;
+  linkageData* ld;
+}
+  linkageList;
+
+
+
+  /** 
+   *
+   * the data structure below is fundamental for representing trees 
+     in the library!
+
+     Inner nodes are represented by three instances of the node data structure that are linked 
+     into a cyclic list via the next pointer.
+
+     So for building an inner node of the tree we need to allocate three nodeptr 
+     data structures and link them together, e.g.:
+
+     assuming that we have allocated space for an inner node 
+     for nodeptr pointers p1, p2, p3, 
+
+     we would then link them like this:
+
+     p1->next = p2;
+     p2->next = p3;
+     p3->next = p1;
+
+     also note that the node number that identifies the inner node 
+     needs to be set to the same value.
+
+     for n taxa, tip nodes are enumerated/indexed from 1....n,
+     and inner node indices start at n+1. Assuming that we have 10 taxa 
+     and this is our first inner node, we'd initialize the number as follows:
+
+     p1->number = 11;
+     p2->number = 11;
+     p3->number = 11;
+
+     Note that the node number is important for indexing tip sequence data as well as inner likelihood vectors 
+     and that it is this number (the index) that actually gets stored in the traversal descriptor.
+
+     Tip nodes are non-cyclic nodes that simply consist of one instance/allocation of nodeptr.
+
+     if we have allocated a tip data structure nodeptr t1, 
+     we would initialize it as follows:
+
+     t1->number = 1;
+
+     t1->next = NULL;
+
+     now let's assume that we want to build a four taxon tree with tips t1, t2, t3, t4 
+     and inner nodes (p1,p2,p3) and (q1,q2,q3).
+
+     we first build the tips:
+
+     t1->number = 1;
+     t1->next = NULL;
+     
+     t2->number = 2;
+     t2->next = NULL;
+
+     t3->number = 3;
+     t3->next = NULL;
+
+     t4->number = 4;
+     t4->next = NULL;
+     
+     now the first inner node
+
+     p1->next = p2;
+     p2->next = p3;
+     p3->next = p1;    
+
+     p1->number = 5;
+     p2->number = 5;
+     p3->number = 5;
+
+     and the second inner node.
+
+     q1->next = q2;
+     q2->next = q3;
+     q3->next = q1;    
+
+     q1->number = 6;
+     q2->number = 6;
+     q3->number = 6;
+     
+     now we need to link the nodes together such that they form a tree, let's assume we want ((t1,t2), (t3, t4));
+
+     we will have to link the nodes via the so-called back pointer,
+     i.e.:
+
+     let's connect node p with t1 and t2
+
+     t1->back = p1;
+     t2->back = p2;
+
+     and vice versa:
+
+     p1->back = t1;
+     p2->back = t2;
+
+     let's connect node p with node q:
+
+     p3->back = q3;
+
+     and vice versa:
+
+     q3->back = p3;
+
+     and now let's connect node q with tips t3 and t4:
+
+     q1->back = t3;
+     q2->back = t4;
+
+     and vice versa:
+
+     t3->back = q1;
+     t4->back = q2;
+
+     What remains to be done is to set up the branch lengths.
+     Using the data structure below, we always have to store the 
+     branch length twice for each "topological branch" unfortunately.
+
+     Assuming that we are only estimating a single branch across all partitions 
+     we'd just set the first index of the branch length array z[PLL_NUM_BRANCHES].
+
+     e.g., 
+
+     t3->z[0] = q1->z[0] = 0.9;
+
+     the above operation for connecting nodes is implemented in the function hookup(), which sets 
+     the back pointers of the two nodes that are to be connected as well as the branch lengths.
+
+     The branchInfo data field is a pointer to a data-structure that stores meta-data and requires 
+     the tree not to change while it is being used.
+     
+     Also, this pointer needs to be set by doing a full tree traversal on the tree.
+
+     Note that q1->bInf == t3->bInf in the above example.
+
+     The hash number is used for mapping bipartitions to a hash table as described in the following paper:
+
+     A. Aberer, N. Pattengale, A. Stamatakis: "Parallelized phylogenetic post-analysis on multi-core architectures". Journal of Computational Science 1, 107-114, 2010.
+     
+     The support data field stores the support value for the branch associated with each nodeptr structure.
+     Note that support always refers to branches. 
+
+     Thus for consistency, q3->support must be equal to p3->support;
+
+     Finally, the three char fields x, xPars and xBips are very very important!
+
+     They are used to denote the presence/absence or if you want, direction of the 
+     parsimony, bipartition, or likelihood vector at a node with respect to the virtual root.
+
+     Essentially, they are just used as single presence/absence bits and ONLY for inner nodes!
+
+     When setting up new inner nodes, one of the three pointers in the cyclic list must 
+     have x = 1 and the other two x = 0;
+
+     in the above example we could set:
+
+     p1->x = 0;
+     p2->x = 0;
+     p3->x = 1;
+
+     q1->x = 0;
+     q2->x = 0;
+     q3->x = 1;
+
+     This would mean that the virtual root is located at the inner branch of the four taxon tree ((t1,t2),(t3,t4));
+
+     When we re-root the tree at some other branch we need to update the location of the x pointer that is set to 1.
+
+     This means if we root the tree at the branch leading to t1 we would set 
+
+     p1->x = 1;
+     p2->x = 0;
+     p3->x = 0;
+
+     the values for q remain unchanged since q3 is still pointing toward the root.
+
+     When we re-locate the root to branch p1 <-> t1 the fact that we have to "rotate" the x value that is set to 1
+     to another node of the cyclic list representing the abstract topological node p, also tells us that we 
+     need to re-compute the conditional likelihood array for p. 
+
+     Note that, only one likelihood or parsimony array is stored per inner node and the location of x essentially tells us which subtree 
+     it summarizes, if p1->x == 1, it summarizes subtree (t2, (t3, t4)), if p3->x = 1 the likelihood vector associated with 
+     node p summarizes subtree (t1, t2).
+
+     @todo I think we should rename the back pointer. It's not back, it can be forward depending on the orientation. We should rename it to outer. Back is too confusing; I would assume it's the opposite of next, i.e. previous.
+
+     @struct noderec
+
+     @brief Tree node record
+
+     A node in a tree is a structure which contains a cyclic list of pointers to 3 nodes which we call a \e roundabout. The first node is the structure itself, and the other two nodes are accessed via \a noderec->next and \a noderec->next->next. To access the outer node with which each of the 3 nodes forms an edge one has to use the \a back pointer
+
+     @var noderec::next
+     @brief Next node in the roundabout
+
+     @var noderec::back
+     @brief Outer node
+
+     @var noderec::number
+     @brief Node identifier
+
+     In general, tips (i.e. leaves) are numbered from 1 to \e n where \e n is the number of taxa. Identifiers for internal nodes start from \e n + 1. Note
+     that for a given inner node, the identifier must be the same for all 3 nodes that compose it.
+
+     @var noderec::z
+     @brief The branch lengths per partition for the main node in the roundabout
+
+     @todo Append an image
+  */
+typedef  struct noderec
+{
+ 
+  branchInfo      *bInf;
+  double           z[PLL_NUM_BRANCHES];
+  struct noderec  *next;        
+  struct noderec  *back;       
+  hashNumberType   hash;
+  int              support;
+  int              number;    
+  char             x;
+  char             xPars;
+  char             xBips;
+}
+  node, *nodeptr;
+
+typedef unsigned int parsimonyNumber;
+
+/* @brief Alignment, transition model, model of rate heterogenety and likelihood vectors for one partition.
+  * 
+  * @todo De-couple into smaller data structures
+  *
+  * ALIGNMENT DATA 
+  * This depends only on the type of data in this partition of the alignment 
+  *
+  * MODEL OF RATE HETEROGENEITY: we use either GAMMA or PSR 
+  * Rate heterogeneity: Per Site Categories (PSR) model, aka CAT, 
+  * Rate of site i is given by perSiteRates[rateCategory[i]]
+  *
+  * TRANSITION MODEL: We always assume General Time Reversibility 
+  * Transition probability matrix: P(t) = exp(Qt)
+  * Branch length t is the expected number of substitutions per site 
+  * Pij(t) is the probability of going from state i to state j in a branch of length t 
+  * Relative substitution rates (Entries in the Q matrix) 
+  * In GTR we can write Q = S * D, where S is a symmetrical matrix and D a diagonal with the state frequencies 
+
+    @var protModels
+    @brief Protein models
+
+    @detail Detailed protein models description
+
+    @var autoProtModels
+    @brief Auto prot models
+    @detail Detailed auto prot models
+  */
+ 
+
+
+/** @struct pInfo
+    
+    @brief Partition information structure
+
+    This data structure encapsulates all properties and auxiliary variables that together
+    constitute a partition.
+
+    @var pInfo::dataType
+    @brief Type of data this partition contains
+
+    Can be DNA (\b PLL_DNA_DATA) or AminoAcid (\b PLL_AA_DATA) data
+
+    @var pInfo::states
+    @brief Number of states
+
+    Number of states that this type of data can take
+
+    @var pInfo::maxTipStates
+    @brief Number of undetermined states (possible states at the tips)
+
+    This is the total number of possible states that can appear in the alignment. This includes degenerate (undetermined) bases
+
+    @var pInfo::partitionName
+    @brief Name of partition
+
+    A null-terminated string describing the name of partition
+
+    @var pInfo::lower
+    @brief Position of the first site in the alignment that is part of this partition [1, tr->originalCrunchedLength]
+
+    @var pInfo::upper
+    @brief Position of the last site that is part of this partition plus one (i.e. position of the first site that is not part of this partition) 
+
+    @var pInfo::width
+    @brief Number of sites in the partition (i.e. \a upper - \a lower)
+
+    @var pInfo::wgt
+    @brief Weight of site
+
+    Number of times this particular site appeared in the partition before the duplicates were removed and replaced by this weight
+
+    @var pInfo::empiricalFrequencies
+    @brief Empirical frequency of each state in the current partition
+
+    @var pInfo::perSiteRates
+    @brief Per Site Categories (PSR), aka CAT, values for each rate category
+
+    @var pInfo::rateCategory
+    @brief CAT category index for each site
+
+    @var pInfo::numberOfCategories
+    @brief Size of the set of possible CAT rate categories
+
+    @var pInfo::alpha
+    @brief Gamma parameter to be optimized
+    
+    @var pInfo::gammaRates
+    @brief Values of the 4 gamma categories (rates) computed given an alpha
+
+    @var pInfo::substRates
+    @brief Entries of substitution matrix, e.g. 6 free parameters in DNA
+
+    In GTR we can write \f$ Q = S * D \f$, where \f$ S \f$ is a symmetrical matrix and \f$ D \f$ a diagonal with the state frequencies,
+    which is represented by the array \a frequencies. The symmetrical matrix is the array \a substRates
+
+    @var pInfo::frequencies
+    @brief State frequencies, entries in D are initialized as empiricalFrequencies
+    
+    In GTR we can write \f$ Q = S * D \f$, where \f$ S \f$ is a symmetrical matrix and \f$ D \f$ a diagonal with the state frequencies,
+    which is represented by the array \a frequencies. The symmetrical matrix is the array \a substRates
+
+    @var pInfo::freqExponents
+
+    @var pInfo::EIGN
+    @brief Eigenvalues of Q matrix
+
+    @var pInfo::EV
+    @brief Eigenvectors of Q matrix
+
+    @var pInfo::EI
+    @brief Inverse eigenvectors of Q matrix
+
+    @var pInfo::left
+    @brief P matrix for the left term of the conditional likelihood equation
+
+    @var pInfo::right
+    @brief P matrix for the right term of the conditional likelihood equation
+
+    @var pInfo::tipVector
+    @brief Precomputed (based on current P matrix) conditional likelihood vectors for every possible base 
+
+    @var pInfo::EIGN_LG4
+    @brief Eigenvalues of Q matrix for the LG4 model
+
+    @var pInfo::EV_LG4
+    @brief Eigenvectors of Q matrix for the LG4 model
+
+    @var pInfo::EI_LG4
+    @brief Inverse eigenvectors of Q matrix for the LG4 model
+    
+    @var pInfo::frequencies_LG4
+    @brief State frequencies for the LG4 model
+
+    @var pInfo::tipVector_LG4
+    @brief Precomputed (based on current P matrix) conditional likelihood vectors for every possible base for the LG4 model
+
+    @var pInfo::substRates_LG4
+    @brief Entries of substitution matrix for the LG4 model
+
+    @var pInfo::protModels
+    @brief Protein model for current partition
+
+    In case \a pInfo::dataType is set to \a PLL_AA_DATA then \a protModels indicates the index in the global array \a protModels
+    of the protein model that the current partition uses.
+
+    @var pInfo::autoProtModels
+    @brief Best fitted protein model for the \b PLL_AUTO partitions
+
+    If \a protModels is set to \b PLL_AUTO then \a autoProtModels holds the currently detected best fitting protein model for the partition
+
+    @var pInfo::protUseEmpiricalFreqs
+
+    @var pInfo::nonGTR
+
+    @var pInfo::optimizeBaseFrequencies
+
+    @var pInfo::optimizeAlphaParameter
+
+    @var pInfo::optimizeSubstitutionRates
+
+    @var pInfo::symmetryVector
+
+    @var pInfo::frequencyGrouping
+
+
+    @todo
+      Document freqExponents
+
+*/
+
+
+
+typedef struct {
+  int     dataType;
+  int     states;
+  int     maxTipStates;
+  char   *partitionName;
+  int     lower;
+  int     upper;
+  int     width;
+  int    *wgt;
+  double *empiricalFrequencies; 
+
+
+  /* MODEL OF RATE HETEROGENEITY: we use either GAMMA or PSR */
+  /* Rate heterogeneity: Per Site Categories (PSR) model, aka CAT, see updatePerSiteRates() */
+  /* Rate of site i is given by perSiteRates[rateCategory[i]] */
+  double *perSiteRates;
+  int    *rateCategory;
+  int     numberOfCategories;
+  /* Rate heterogeneity: GAMMA model of rate heterogeneity */
+  double alpha;
+  double *gammaRates;
+
+
+  /* TRANSITION MODEL: We always assume General Time Reversibility */
+  /* Transition probability matrix: P(t) = exp(Qt)*/
+  /* Branch length t is the expected number of substitutions per site */
+  /* Pij(t) is the probability of going from state i to state j in a branch of length t */
+  /* Relative substitution rates (Entries in the Q matrix) */
+  /* In GTR we can write Q = S * D, where S is a symmetrical matrix and D a diagonal with the state frequencies */
+  double *substRates;       /**< TRANSITION MODEL Entries in S, e.g. 6 free parameters in DNA */   
+  double *frequencies;      /**< State frequencies, entries in D, are initialized as empiricalFrequencies */
+  double *freqExponents;
+  /* Matrix decomposition: @todo map this syntax to Explanation of the mathematical background */
+  double *EIGN;
+  double *EV;
+  double *EI;
+  double *left;
+  double *right;
+  double *tipVector;
+
+
+  /* asc bias */
+  pllBoolean       ascBias;
+  int           ascOffset;
+  int         * ascExpVector;
+  double      * ascSumBuffer;
+  double      * ascVector;
+  double        ascScaler[64];
+  
+  /* LG4 */
+
+  double *EIGN_LG4[4];
+  double *EV_LG4[4];
+  double *EI_LG4[4];
+
+  double *frequencies_LG4[4];
+  double *tipVector_LG4[4];
+  double *substRates_LG4[4];
+  
+  /* LG4X */
+
+  double lg4x_weights[4];
+  double lg4x_weightExponents[4];
+  double lg4x_weightsBuffer[4];
+  double lg4x_weightExponentsBuffer[4];
+  double lg4x_weightLikelihood;
+  
+  /* Protein specific */
+  int     protModels;			/**< Empirical model matrix */
+  int     autoProtModels;		/**< Model selected with "auto" protein model */
+  int     protUseEmpiricalFreqs;	/**< Whether to use empirical frequencies for protein model */
+
+  pllBoolean nonGTR;
+  pllBoolean optimizeBaseFrequencies;	/**< Whether to optimize base frequencies */
+  pllBoolean optimizeAlphaParameter;	/**< Whether to optimize alpha parameters and gamma rates */
+  pllBoolean optimizeSubstitutionRates;	/**< Whether to optimize substitution rates */
+  int    *symmetryVector;		/**< Specify linkage between substitution rate parameters */
+  int    *frequencyGrouping;
+
+  /* LIKELIHOOD VECTORS */
+
+  /* partial LH inner (ancestral) vectors; we have 2*tips - 3 inner nodes */
+  double          **xVector;          /**< Conditional likelihood vectors for inner nodes */
+  unsigned char   **yVector;          /**< Tip entries (sequence) for tip nodes */
+  unsigned int     *globalScaler;     /**< Counters for scaling operations done at node i */
+
+  /* data structures for conducting per-site likelihood scaling.
+     this allows to compute the per-site log likelihood scores 
+     needed for RELL-based bootstrapping and all sorts of statistical 
+     tests for comparing trees ! */
+  int              **expVector;     /**< @brief An entry per inner node. Each element is an array of size the number of sites in the current partition and represents how many times the respective site has been scaled in the subtree rooted at the current node */
+  size_t           *expSpaceVector; /**< @brief Each entry represents an inner node and states the size of the corresponding element in \a expVector, which is the number of sites for the current partition */
+
+  /* These are for the saveMemory option (tracking gaps to skip computations and memory) */
+  size_t           *xSpaceVector;       /* Size of conditional likelihood vectors per inner node */
+  int               gapVectorLength;    /**< Length of \a gapVector bitvector in unsigned integers, assuming that \a unsigned \a int is 32 bits. It is set to partition size / 32 */
+  unsigned int     *gapVector;          /**< A bit vector of size \a gapVectorLength * 32 bits. A bit is set to 1 if the corresponding entry is a gap (undetermined character); used by the \a saveMemory option to skip computations and memory */
+  double           *gapColumn; 
+
+  /* Parsimony vectors at each node */
+  size_t parsimonyLength;
+  parsimonyNumber *parsVect; 
+  parsimonyNumber *perSiteParsScores;
+
+  /* This buffer of size width is used to store intermediate values for the branch length optimization under 
+     newton-raphson. The data in here can be re-used for all iterations irrespective of the branch length.
+   */
+  double *sumBuffer; 
+
+  /* Buffer to store the per-site log likelihoods */
+  double *perSiteLikelihoods;
+
+  /* This buffer of size width is used to store the ancestral state at a node of the tree. */
+  double *ancestralBuffer;
+
+  /* From tree */
+  pllBoolean executeModel;
+  double fracchange;
+  double rawFracchange;
+  double partitionContribution;
+  double partitionWeight;
+  double partitionLH;
+
+// #if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
+  int partitionAssignment;
+// #endif
+
+} pInfo;
+
+typedef struct
+ {
+   pInfo **partitionData;
+   int numberOfPartitions;
+   pllBoolean perGeneBranchLengths;
+   pllBoolean dirty;
+   linkageList *alphaList;
+   linkageList *rateList;
+   linkageList *freqList;
+ }  partitionList;
+
+
+
+#define PLL_REARR_SETTING 1
+#define PLL_FAST_SPRS     2
+#define PLL_SLOW_SPRS     3
+
+
+/** @brief Checkpointing states. 
+ * 
+ * @todo Raxml specific 
+  */
+typedef struct {
+ 
+  int state;
+
+  /*unsigned int vLength;*/
+  double accumulatedTime;  
+  int rearrangementsMax;
+  int rearrangementsMin;
+  int thoroughIterations;
+  int fastIterations;
+  int mintrav;
+  int maxtrav;
+  int bestTrav;
+  double startLH; 
+  double lh;
+  double previousLh;
+  double difference;
+  double epsilon;  
+  pllBoolean impr;
+  pllBoolean cutoff;  
+       
+  double tr_startLH;
+  double tr_endLH;
+  double tr_likelihood;
+  double tr_bestOfNode;  
+  double tr_lhCutoff;
+  double tr_lhAVG;
+  double tr_lhDEC;
+  int    tr_NumberOfCategories;
+  int    tr_itCount;  
+  int    tr_doCutoff;
+  int    tr_thoroughInsertion;
+  int    tr_optimizeRateCategoryInvocations;
+ 
+  /* prevent users from doing stupid things */
+ 
+  int searchConvergenceCriterion;
+  int rateHetModel;
+  int maxCategories;
+  int NumberOfModels;
+  int numBranches;
+  int originalCrunchedLength;    
+  int mxtips;
+  char seq_file[1024];
+} checkPointState;
+
+
+
+/* recomp */
+#ifdef _DEBUG_RECOMPUTATION
+typedef struct {
+  unsigned long int numTraversals;
+  unsigned long int tt;
+  unsigned long int ti;
+  unsigned long int ii;
+  unsigned int *travlenFreq;
+} traversalCounter;
+#endif
+/* E recomp */
+
+
+/** @brief Tree topology.
+ * 
+ * @todo Apart from the topology this structure contains several fields that act like global variables in raxml
+  */
+typedef  struct  {
+
+  int *ti;
+
+  /* recomp */
+  recompVectors *rvec;            /**< this data structure tracks which vectors store which nodes */
+  float maxMegabytesMemory;       /**< User says how many MB in main memory should be used */
+  float vectorRecomFraction;      /**< vectorRecomFraction ~= 0.8 * maxMegabytesMemory  */
+  pllBoolean useRecom;               /**< ON if we apply recomputation of ancestral vectors*/
+#ifdef _DEBUG_RECOMPUTATION 
+  traversalCounter *travCounter;
+  double stlenTime;
+#endif
+  /* E recomp */
+  
+  pllBoolean fastScaling;
+  pllBoolean saveMemory;
+  int              startingTree;
+  long             randomNumberSeed;
+
+  double          *lhs;         /**< Array to store per-site log likelihoods of \a originalCrunchedLength (compressed) sites */
+  double          *patrat;      /**< rates per pattern */
+  double          *patratStored; 
+  int             *rateCategory;
+  int             *aliaswgt;    /**< weight by pattern */ 
+  pllBoolean    manyPartitions;
+
+  pllBoolean grouped;              /**< No idea what this is, but is always set to PLL_FALSE */
+  pllBoolean constrained;          /**< No idea what this is, but is always set to PLL_FALSE */
+  int threadID;
+  volatile int numberOfThreads;
+
+//#if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
+ 
+  unsigned char *y_ptr; 
+  
+  double lower_spacing;
+  double upper_spacing; 
+
+  double *ancestralVector;
+
+//#endif
+  
+  pllHashTable     *nameHash;
+  char           ** tipNames;
+
+  char             *secondaryStructureInput;
+
+  traversalData    td[1];
+
+  int              maxCategories;
+  int              categories;
+
+  double           coreLZ[PLL_NUM_BRANCHES];
+  
+ 
+  branchInfo       *bInf;
+
+  int              multiStateModel;
+
+
+  pllBoolean curvatOK[PLL_NUM_BRANCHES];
+
+  /* the stuff below is shared among DNA and AA, span does
+     not change depending on datatype */
+
+  /* model stuff end */
+  unsigned char    **yVector;        /**< list of raw sequences (parsed from the alignment)*/
+
+  int              secondaryStructureModel;
+  int              originalCrunchedLength; /**< Length of alignment after removing duplicate sites in each partition */
+
+  int              *secondaryStructurePairs;
+
+  double            fracchange;      /**< Average substitution rate */
+  double            rawFracchange;
+  double            lhCutoff;
+  double            lhAVG;
+  unsigned long     lhDEC;
+  unsigned long     itCount;
+  int               numberOfInvariableColumns;
+  int               weightOfInvariableColumns;
+  int               rateHetModel;
+
+  double           startLH;
+  double           endLH;
+  double           likelihood;           /**< last likelihood value evaluated for the current topology */
+ 
+  node           **nodep;                /**< pointer to the list of nodes, which describe the current topology */
+  nodeptr          nodeBaseAddress;
+  node            *start;                /**< starting node by default for full traversals (must be a tip contained in the tree we are operating on) */
+  int              mxtips;  /**< Number of tips in the topology */
+
+  int              *constraintVector;   /**< @todo What is this? */
+  int              numberOfSecondaryColumns;
+  pllBoolean          searchConvergenceCriterion;
+  int              ntips;
+  int              nextnode;  
+
+  pllBoolean          bigCutoff;
+  pllBoolean          partitionSmoothed[PLL_NUM_BRANCHES];
+  pllBoolean          partitionConverged[PLL_NUM_BRANCHES];
+  pllBoolean          rooted;
+  pllBoolean          doCutoff;
+ 
+  double         gapyness;
+
+  char **nameList;     /**< list of tips names (read from the phylip file) */
+  char *tree_string;   /**< the newick representation of the topology */
+  char *tree0;
+  char *tree1;
+  int treeStringLength;
+ 
+  unsigned int bestParsimony;
+  unsigned int *parsimonyScore;
+  
+  double bestOfNode;
+  nodeptr removeNode;   /**< the node that has been removed. Together with \a insertNode represents an SPR move */
+  nodeptr insertNode;   /**< the node where insertion should take place. Together with \a removeNode represents an SPR move */
+
+  double zqr[PLL_NUM_BRANCHES];
+  double currentZQR[PLL_NUM_BRANCHES];
+
+  double currentLZR[PLL_NUM_BRANCHES];
+  double currentLZQ[PLL_NUM_BRANCHES];
+  double currentLZS[PLL_NUM_BRANCHES];
+  double currentLZI[PLL_NUM_BRANCHES];
+  double lzs[PLL_NUM_BRANCHES];
+  double lzq[PLL_NUM_BRANCHES];
+  double lzr[PLL_NUM_BRANCHES];
+  double lzi[PLL_NUM_BRANCHES];
+
+
+  unsigned int **bitVectors;
+
+  unsigned int vLength;
+
+  pllHashTable *h;                 /**< hashtable for ML convergence criterion */
+  //hashtable *h;
+ 
+  int optimizeRateCategoryInvocations;
+
+  checkPointState ckp;
+  pllBoolean thoroughInsertion; /**< true if the neighbor branches should be optimized when a subtree is inserted (slower)*/
+  pllBoolean useMedian;
+
+  int autoProteinSelectionType;
+
+  pllStack * rearrangeHistory;
+
+
+  /* analdef defines */
+  /* TODO: Do some initialization */
+  int              bestTrav;            /**< best rearrangement radius */
+  int              max_rearrange;       /**< max. rearrangemenent radius */
+  int              stepwidth;           /**< step in rearrangement radius */
+  int              initial;             /**< user defined rearrangement radius which also sets bestTrav if initialSet is set */
+  pllBoolean          initialSet;          /**< set bestTrav according to initial */
+  int              mode;                /**< candidate for removal */
+  pllBoolean        perGeneBranchLengths;
+  pllBoolean        permuteTreeoptimize;   /**< randomly select subtrees for SPR moves */
+  pllBoolean        compressPatterns;
+  double         likelihoodEpsilon;
+  pllBoolean        useCheckpoint;
+
+} pllInstance;
+
+/** @brief Stores data related to a NNI move  */
+typedef struct {
+        pllInstance * tr;
+        nodeptr p;
+        int nniType;
+        double z[PLL_NUM_BRANCHES]; // optimize branch lengths
+        double z0[PLL_NUM_BRANCHES]; // unoptimized branch lengths
+        double likelihood;
+        double deltaLH;
+} nniMove;
+
+/***************************************************************/
+
+typedef struct {
+  int partitionNumber;
+  int partitionLength;
+} partitionType;
+
+typedef struct
+{
+  double z[PLL_NUM_BRANCHES];
+  nodeptr p, q;
+  int cp, cq;
+}
+  connectRELL, *connptrRELL;
+
+typedef  struct
+{
+  connectRELL     *connect; 
+  int             start;
+  double          likelihood;
+}
+  topolRELL;
+
+
+typedef  struct
+{
+  int max;
+  topolRELL **t;
+}
+  topolRELL_LIST;
+
+/**************************************************************/
+
+/** @brief Connection within a topology.
+*   */
+typedef struct conntyp {
+    double           z[PLL_NUM_BRANCHES];           /**< branch length */
+    node            *p, *q;       /**< parent and child sectors */
+    void            *valptr;      /**< pointer to value of subtree */
+    int              descend;     /**< pointer to first connect of child */
+    int              sibling;     /**< next connect from same parent */
+    } pllConnect, *connptr;
+
+/** @brief Single Topology
+*   */
+typedef  struct {
+    double           likelihood;
+    int              initialTreeNumber;
+    pllConnect         *links;       /**< pointer to first connect (start) */
+    node            *start;
+    int              nextlink;    /**< index of next available connect */
+                                  /**< tr->start = tpl->links->p */
+    int              ntips;
+    int              nextnode;    /**< next available inner node for tree parsing */
+    int              scrNum;      /**< position in sorted list of scores */
+    int              tplNum;      /**< position in sorted list of trees */
+    } topol;
+
+/** @brief small helper data structure for printing out/downstream use of marginal ancestral probability vectors.
+*
+* it is allocated as an array that has the same length as the input alignment and can be used to 
+*   index the ancestral states for each position/site/pattern 
+*   */
+typedef struct {
+  double *probs; /**< marginal ancestral states */
+  char c; /**< most likely state, i.e. the state maximizing probs[i] above */
+  int states; /**< number of states for this position */
+} ancestralState;
+
+/** @brief List of topologies
+*
+*   */
+typedef struct {
+    double           best;        /**< highest score saved */
+    double           worst;       /**< lowest score saved */
+    topol           *start;       /**< starting tree for optimization */
+    topol          **byScore;
+    topol          **byTopol;
+    int              nkeep;       /**< maximum topologies to save */
+    int              nvalid;      /**< number of topologies saved */
+    int              ninit;       /**< number of topologies initialized */
+    int              numtrees;    /**< number of alternatives tested */
+    pllBoolean          improved;
+    } bestlist;
+
+/** @brief  This is used to look up some hard-coded data for each data type 
+*   */
+typedef struct 
+{
+  int leftLength;         /**< s^2 */
+  int rightLength;/**< s^2 */
+  int eignLength;/**<  s */
+  int evLength;
+  int eiLength;
+  int substRatesLength;   /**< (s^2 - s)/2 free model parameters for matrix Q i.e. substitution rates */
+  int frequenciesLength;  /**< s frequency of each state */ 
+  int tipVectorLength;    /* ??? */
+  int symmetryVectorLength;
+  int frequencyGroupingLength;
+
+  pllBoolean nonGTR;
+  pllBoolean optimizeBaseFrequencies;
+
+  int undetermined;
+
+  const char *inverseMeaning;
+
+  int states;   /* s */
+
+  pllBoolean smoothFrequencies;
+
+  const unsigned  int *bitVector;
+
+} partitionLengths;
+
+typedef struct
+{
+  int rearrangeType;
+  double  likelihood;
+
+  union {
+    struct {
+      double * zp;
+      double * zpn;
+      double * zpnn;
+      double * zqr;
+      nodeptr pn;
+      nodeptr pnn;
+      nodeptr r;
+      nodeptr p;
+      nodeptr q;
+    } SPR;
+    struct {
+      nodeptr origin;
+      int swapType;
+      double z[PLL_NUM_BRANCHES];
+    } NNI;
+  };
+} pllRollbackInfo;
+
+
+/** @struct pllRearrangeAttr
+ 
+    @brief Structure holding attributes for searching possible tree rearrangements
+    
+    Holds the attributes for performing tree rearrangements.
+
+    @var pllRearrangeAttr::p
+      The origin node where the search should start
+
+    @var pllRearrangeAttr::mintrav
+      The minimum radius around the origin node \a p for which nodes should be tested
+
+    @var pllRearrangeAttr::maxtrav
+      The maximum radius around the origin node \a p for which nodes should be tested
+
+    @var pllRearrangeAttr::max
+      Maximum number of results to be returned
+*/
+typedef struct
+ {
+   nodeptr p;
+   int mintrav;
+   int maxtrav;
+ } pllRearrangeAttr;
+
+/** @typedef pllRearrangeInfo
+    
+    @brief Tree rearrangement information structure
+
+    Holds information for conducting tree rearrangements. This structure
+    is the result of a tree rearrangement search under given search
+    attributes.
+
+    @var pllRearrangeInfo::rearrangeType
+      Type of rearrangement. Can be \b PLL_REARRANGE_SPR, \b PLL_REARRANGE_NNI or
+      \b PLL_REARRANGE_TBR
+    
+    @var pllRearrangeInfo::likelihood
+      Holds the computed likelihood for the addressed rearrangement
+
+    @var pllRearrangeInfo::SPR::removeNode
+      Node where to perform subtree pruning
+
+    @var pllRearrangeInfo::SPR::insertNode
+      Node where to place the pruned subtree
+
+    @var pllRearrangeInfo::zqr
+      Holds the computed branch lengths after the SPR
+*/
+typedef struct
+ {
+   int rearrangeType;
+   double  likelihood;
+   union {
+     struct {
+       nodeptr removeNode;
+       nodeptr insertNode;
+       double  zqr[PLL_NUM_BRANCHES];
+     } SPR;
+     struct {
+       nodeptr originNode;
+       int     swapType;
+     } NNI;
+   };
+ } pllRearrangeInfo;
+
+
+typedef struct
+ {
+   int max_entries;
+   int entries;
+   pllRearrangeInfo * rearr;
+ } pllRearrangeList;
+
+/** @brief Generic structure for storing a multiple sequence alignment */
+typedef struct
+ {
+   int              sequenceCount;      /**< @brief Number of sequences */
+   int              sequenceLength;     /**< @brief Length of sequences */
+   int              originalSeqLength;  /**< @brief Original length of sequences (not modified after removing duplicates) */
+   char          ** sequenceLabels;     /**< @brief An array where the \a i-th element is the name of the \a i-th sequence */
+   unsigned char ** sequenceData;       /**< @brief The actual sequence data */
+   int            * siteWeights;        /**< @brief An array where the \a i-th element indicates how many times site \a i appeared (prior to duplicates removal) in the alignment */
+ } pllAlignmentData;
+
+
+/******************** START OF API FUNCTION DESCRIPTIONS ********************/
+
+#if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
+pllBoolean isThisMyPartition(partitionList *pr, int tid, int model);
+void printParallelTimePerRegion(void); 
+#endif
+
+#ifdef _FINE_GRAIN_MPI
+extern void pllFinalizeMPI (void);
+#endif
+
+
+
+/**
+ * @brief Create the main instance of PLL
+ *   
+ * Create an instance of the phylogenetic likelihood library
+ *
+ * @param pInst          Instance attributes (\a pllInstanceAttr): the rate heterogeneity model
+ *                       (\a rateHetModel), \a fastScaling, \a saveMemory, \a useRecom (if set to
+ *                       \b PLL_TRUE, enables ancestral state recomputation), the random number
+ *                       seed and the number of threads
+ * 
+ * @todo                 Document fastScaling, rate heterogeneity and saveMemory and useRecom
+ *
+ * @note                 Do not set \a saveMemory to \b PLL_TRUE when using \a useRecom, as memory saving 
+ *                       techniques are not yet implemented for ancestral state recomputation. 
+ * 
+ * @return               On success returns an instance of PLL, otherwise \b NULL
+ */
+extern pllInstance * pllCreateInstance (pllInstanceAttr * pInst);
+
+/** 
+ *  @ingroup instanceLinkingGroup
+ *  @brief Load alignment to the PLL instance
+ *   
+ *   Loads (copies) the parsed alignment \a alignmentData to the PLL instance
+ *   as a deep copy.
+ * 
+ *    @param tr              The library instance
+ *    @param alignmentData   The multiple sequence alignment
+ *    @param pList           List of partitions
+ *
+ *    @return Returns 1 in case of success, 0 otherwise.
+ */
+extern int pllLoadAlignment (pllInstance * tr, 
+                             pllAlignmentData * alignmentData, 
+                             partitionList * pList);
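+
+/* Illustrative sketch (not part of the upstream sources): a typical start-up sequence
+   using the declarations in this header. The file names, seed and thread count are
+   placeholders; error handling is omitted.
+
+     pllInstanceAttr attr;
+     attr.rateHetModel     = PLL_GAMMA;
+     attr.fastScaling      = PLL_FALSE;
+     attr.saveMemory       = PLL_FALSE;
+     attr.useRecom         = PLL_FALSE;
+     attr.randomNumberSeed = 12345;
+     attr.numberOfThreads  = 1;
+
+     pllInstance      * tr    = pllCreateInstance (&attr);
+     pllAlignmentData * align = pllParseAlignmentFile (PLL_FORMAT_PHYLIP, "example.phy");
+     pllQueue         * parts = pllPartitionParse ("example.partitions");
+
+     if (pllPartitionsValidate (parts, align))
+       {
+         partitionList * pr = pllPartitionsCommit (parts, align);
+         pllQueuePartitionsDestroy (&parts);
+         pllAlignmentRemoveDups (align, pr);
+         pllTreeInitTopologyForAlignment (tr, align);
+         pllLoadAlignment (tr, align, pr);
+       }
+*/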
+
+/**
+ * @brief Compute the empirical base frequencies for all partitions
+ *
+ * Compute the empirical base frequencies for all partitions in the list \a pl.
+ *
+ * @param pl                Partition list
+ * @param alignmentData     Multiple sequence alignment
+ *
+ * @return   A list of \a pl->numberOfPartitions arrays each of size
+             \a pl->partitionData[i]->states, where \a i is the \a i-th partition
+*/
+extern double ** pllBaseFrequenciesAlignment (pllAlignmentData * alignmentData, partitionList * pl);
+extern double ** pllBaseFrequenciesInstance (pllInstance * tr, partitionList * pl);
+
+/* pthreads and MPI */
+extern void pllStartPthreads (pllInstance *tr, partitionList *pr);
+extern void pllStopPthreads (pllInstance * tr);
+extern void pllLockMPI (pllInstance * tr);
+extern void pllInitMPI(int * argc, char **argv[]);
+
+
+/* handling branch lengths*/
+extern double pllGetBranchLength (pllInstance *tr, nodeptr p, int partition_id);
+extern void pllSetBranchLength (pllInstance *tr, nodeptr p, int partition_id, double bl);
+extern int pllNniSearch(pllInstance * tr, partitionList *pr, int estimateModel);
+extern void pllOptimizeBranchLengths ( pllInstance *tr, partitionList *pr, int maxSmoothIterations );
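+
+/* Illustrative sketch (not part of the upstream sources): reading and writing the
+   branch length attached to a node and re-smoothing all branch lengths afterwards,
+   assuming a fully initialised instance tr and partition list pr (see the start-up
+   sketch above). Partition id 0 is a placeholder, and PLL_MAX_LOCAL_SMOOTHING_ITERATIONS
+   is used here merely as a plausible iteration count.
+
+     double bl = pllGetBranchLength (tr, tr->start, 0);
+     pllSetBranchLength (tr, tr->start, 0, bl * 2.0);
+     pllOptimizeBranchLengths (tr, pr, PLL_MAX_LOCAL_SMOOTHING_ITERATIONS);
+*/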
+
+
+extern void pllEvaluateLikelihood (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean fullTraversal, pllBoolean getPerSiteLikelihoods);
+extern void pllUpdatePartials (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean masked);
+extern void pllUpdatePartialsAncestral(pllInstance *tr, partitionList *pr, nodeptr p);
+extern void pllNewviewIterative(pllInstance *tr, partitionList *pr, int startIndex);
+extern void pllEvaluateIterative(pllInstance *tr, partitionList *pr, pllBoolean getPerSiteLikelihoods);
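+
+/* Illustrative sketch (not part of the upstream sources): evaluating the likelihood of
+   the current topology with a full traversal; the result is stored in tr->likelihood.
+   tr and pr are assumed to be fully initialised (alignment loaded, model initialised).
+
+     pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+     printf ("log-likelihood: %f\n", tr->likelihood);
+
+   pllUpdatePartials() can be used instead to update only the conditional likelihood
+   vectors affected by a local change before a subsequent (non-full) evaluation. */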
+
+/* newick parser declarations */
+extern pllNewickTree * pllNewickParseString (const char * newick);
+extern pllNewickTree * pllNewickParseFile (const char * filename);
+extern int pllValidateNewick (pllNewickTree *);
+extern void pllNewickParseDestroy (pllNewickTree **);
+extern int pllNewickUnroot (pllNewickTree * t);
+extern char * pllTreeToNewick ( char *treestr, pllInstance *tr, partitionList *pr, nodeptr p,
+      pllBoolean printBranchLengths, pllBoolean printNames, pllBoolean printLikelihood,
+      pllBoolean rellTree, pllBoolean finalPrint, int perGene,
+      pllBoolean branchLabelSupport, pllBoolean printSHSupport);
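+
+/* Illustrative sketch (not part of the upstream sources): reading a user tree from a
+   newick file and installing it as the current topology. The file name is a
+   placeholder, pllValidateNewick is assumed to return a truth value, and PLL_FALSE is
+   passed as the third (not documented here) argument of pllTreeInitTopologyNewick.
+
+     pllNewickTree * nt = pllNewickParseFile ("tree.newick");
+     if (nt)
+       {
+         if (pllValidateNewick (nt))
+           pllTreeInitTopologyNewick (tr, nt, PLL_FALSE);
+         pllNewickParseDestroy (&nt);
+       }
+
+   After installing the topology, the alignment is typically attached with
+   pllLoadAlignment() (see above). */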
+
+/* partition parser declarations */
+extern void  pllQueuePartitionsDestroy (pllQueue ** partitions);
+extern pllQueue * pllPartitionParse (const char * filename);
+extern pllQueue * pllPartitionParseString (const char * p);
+extern void pllPartitionDump (pllQueue * partitions);
+void pllBaseSubstitute (pllInstance * tr, partitionList * partitions);
+//void pllBaseSubstitute (pllAlignmentData * tr, partitionList * partitions);
+partitionList * pllPartitionsCommit (pllQueue * parts, pllAlignmentData * alignmentData);
+int pllPartitionsValidate (pllQueue * parts, pllAlignmentData * alignmentData);
+extern void pllAlignmentRemoveDups (pllAlignmentData * alignmentData, partitionList * pl);
+void pllPartitionsDestroy (pllInstance *, partitionList **);
+
+/* alignment data declarations */
+extern void pllAlignmentDataDestroy (pllAlignmentData *);
+extern int pllAlignmentDataDumpFile (pllAlignmentData *, int, const char *);
+extern void pllAlignmentDataDumpConsole (pllAlignmentData * alignmentData);
+extern pllAlignmentData * pllInitAlignmentData (int, int);
+extern pllAlignmentData * pllParseAlignmentFile (int fileType, const char *);
+extern pllAlignmentData *pllParsePHYLIPString (const char *rawdata, long filesize);
+
+
+/* model management */
+int pllInitModel (pllInstance *, partitionList *);
+void pllInitReversibleGTR(pllInstance * tr, partitionList * pr, int model);
+void pllMakeGammaCats(double alpha, double *gammaRates, int K, pllBoolean useMedian);
+int pllLinkAlphaParameters(char *string, partitionList *pr);
+int pllLinkFrequencies(char *string, partitionList *pr);
+int pllLinkRates(char *string, partitionList *pr);
+int pllSetSubstitutionRateMatrixSymmetries(char *string, partitionList * pr, int model);
+void pllSetFixedAlpha(double alpha, int model, partitionList * pr, pllInstance *tr);
+void pllSetFixedBaseFrequencies(double *f, int length, int model, partitionList * pr, pllInstance *tr);
+int  pllSetOptimizeBaseFrequencies(int model, partitionList * pr, pllInstance *tr);
+void pllSetSubstitutionMatrix(double *q, int length, int model, partitionList * pr,  pllInstance *tr);
+void pllSetFixedSubstitutionMatrix(double *q, int length, int model, partitionList * pr,  pllInstance *tr);
+int pllGetInstRateMatrix (partitionList * pr, int model, double * outBuffer);
+int pllOptimizeModelParameters(pllInstance *tr, partitionList *pr, double likelihoodEpsilon);
+double pllGetAlpha (partitionList * pr, int pid);
+void pllGetGammaRates (partitionList * pr, int pid, double * outBuffer);
+extern void pllGetBaseFrequencies(partitionList * pr, int model, double * outBuffer);
+extern void pllGetSubstitutionMatrix (partitionList * pr, int model, double * outBuffer);
+void pllEmpiricalFrequenciesDestroy (double *** empiricalFrequencies, int models);
+extern void pllOptRatesGeneric(pllInstance *tr, partitionList *pr, double modelEpsilon, linkageList *ll);
+extern void pllOptBaseFreqs(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll);
+extern void pllOptAlphasGeneric(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll);
+extern void pllOptLG4X(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll, int numberOfModels);
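+
+/* Illustrative sketch (not part of the upstream sources): initialising and optimising
+   the model parameters, assuming tr and pr were set up as in the sketches above. The
+   likelihood epsilon 0.1 and the partition index 0 are placeholders.
+
+     pllInitModel (tr, pr);
+     pllOptimizeModelParameters (tr, pr, 0.1);
+     printf ("alpha of partition 0: %f\n", pllGetAlpha (pr, 0));
+*/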
+
+/* tree topology */
+void pllTreeInitTopologyNewick (pllInstance *, pllNewickTree *, int);
+void pllTreeInitTopologyRandom (pllInstance * tr, int tips, char ** nameList);
+void pllTreeInitTopologyForAlignment (pllInstance * tr, pllAlignmentData * alignmentData);
+extern void pllMakeRandomTree ( pllInstance *tr);
+void pllMakeParsimonyTree(pllInstance *tr);
+extern void pllMakeParsimonyTreeFast(pllInstance *tr, partitionList *pr, int sprDist);
+void pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInstance * tr, partitionList * partitions, int sprDist);
+nodeptr pllGetRandomSubtree(pllInstance *);
+extern void pllFreeParsimonyDataStructures(pllInstance *tr, partitionList *pr);
+void pllDestroyInstance (pllInstance *);
+extern void pllGetAncestralState(pllInstance *tr, partitionList *pr, nodeptr p, double * outProbs, char * outSequence);
+unsigned int pllEvaluateParsimony(pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean full, pllBoolean perSiteScores);
+void pllInitParsimonyStructures(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores);
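+
+/* Illustrative sketch (not part of the upstream sources): two common ways of obtaining
+   a starting topology once the instance has been prepared as in the start-up sketch
+   above (exact preconditions are described in the online documentation). The SPR
+   distance 10 is a placeholder.
+
+   Either a random starting tree:
+
+     pllMakeRandomTree (tr);
+
+   or a randomized stepwise-addition parsimony starting tree:
+
+     pllComputeRandomizedStepwiseAdditionParsimonyTree (tr, pr, 10);
+*/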
+
+/* rearrange functions (NNI and SPR) */
+pllRearrangeList * pllCreateRearrangeList (int max);
+void pllDestroyRearrangeList (pllRearrangeList ** bestList);
+void pllRearrangeSearch (pllInstance * tr, partitionList * pr, int rearrangeType, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList);
+void pllRearrangeCommit (pllInstance * tr, partitionList * pr, pllRearrangeInfo * rearr, int saveRollbackInfo);
+int pllRearrangeRollback (pllInstance * tr, partitionList * pr);
+void pllClearRearrangeHistory (pllInstance * tr);
+int pllRaxmlSearchAlgorithm (pllInstance * tr, partitionList * pr, pllBoolean estimateModel);
+int pllGetTransitionMatrix (pllInstance * tr, partitionList * pr, nodeptr p, int model, int rate, double * outBuffer);
+void pllGetTransitionMatrix2 (pllInstance * tr, partitionList * pr, int model, nodeptr p, double * outBuffer);
+int pllGetCLV (pllInstance * tr, partitionList * pr, nodeptr p, int partition, double * outProbs);
+extern int pllTopologyPerformNNI(pllInstance * tr, nodeptr p, int swap);
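+
+/* Illustrative sketch (not part of the upstream sources): searching for SPR moves
+   around an origin node, committing one of the stored moves and rolling it back
+   again. The list size, the radii 1..20 and the use of tr->start as origin node are
+   placeholders; the moves found are stored in best->rearr.
+
+     pllRearrangeList * best = pllCreateRearrangeList (20);
+
+     pllRearrangeSearch (tr, pr, PLL_REARRANGE_SPR, tr->start, 1, 20, best);
+     if (best->entries > 0)
+       {
+         pllRearrangeCommit (tr, pr, &best->rearr[0], PLL_TRUE);
+         pllRearrangeRollback (tr, pr);
+       }
+     pllDestroyRearrangeList (&best);
+*/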
+
+/* hash functions */
+unsigned int pllHashString (const char * s, unsigned int size);
+int pllHashAdd  (pllHashTable * hTable, unsigned int hash, const char * s, void * item);
+pllHashTable * pllHashInit (unsigned int n);
+int pllHashSearch (struct pllHashTable * hTable, char * s, void ** item);
+void pllHashDestroy (struct pllHashTable ** hTable, void (*cbDealloc)(void *));
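+
+/* Illustrative sketch (not part of the upstream sources): storing and retrieving a
+   pointer under a string key with the hash table API above. The table size 64 and the
+   key are placeholders; NULL is passed to pllHashDestroy on the assumption that no
+   per-item deallocation is needed.
+
+     char           key[] = "taxon1";
+     int            value = 42;
+     void         * found = NULL;
+
+     pllHashTable * ht = pllHashInit (64);
+     unsigned int   h  = pllHashString (key, ht->size);
+
+     pllHashAdd (ht, h, key, (void *) &value);
+     if (pllHashSearch (ht, key, &found))
+       printf ("found: %d\n", *(int *) found);
+
+     pllHashDestroy (&ht, NULL);
+*/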
+
+/* node specific functions */
+nodeptr pllGetOrientedNodePointer (pllInstance * pInst, nodeptr p);
+
+/* other functions */
+extern char * pllReadFile (const char *, long *);
+extern int * pllssort1main (char ** x, int n);
+extern node ** pllGetInnerBranchEndPoints (pllInstance * tr);
+
+/* ---------------- */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
diff --git a/pll/pllInternal.h b/pll/pllInternal.h
new file mode 100644
index 0000000..1b6e0ac
--- /dev/null
+++ b/pll/pllInternal.h
@@ -0,0 +1,313 @@
+/*
+ * pllInternal.h
+ *
+ *  Created on: Feb 17, 2014
+ *      Author: diego
+ */
+
+#ifndef PLLINTERNAL_H_
+#define PLLINTERNAL_H_
+
+#include "pll.h"
+#include "genericParallelization.h"
+#include "errcodes.h"
+#include "lexer.h"
+#include "parsePartition.h"
+#include "mem_alloc.h"
+
+//extern int lookupWord(char *s, stringHashtable *h);
+
+extern void getDataTypeString(pllInstance *tr, pInfo *partitionInfo, char typeOfData[1024]);
+extern int countTips(nodeptr p, int numsp);
+extern unsigned int precomputed16_bitcount(unsigned int n, char *bits_in_16bits);
+
+extern size_t discreteRateCategories(int rateHetModel);
+
+extern const partitionLengths * getPartitionLengths(pInfo *p);
+extern pllBoolean getSmoothFreqs(int dataType);
+extern const unsigned int *getBitVector(int dataType);
+extern int getUndetermined(int dataType);
+extern int getStates(int dataType);
+extern char getInverseMeaning(int dataType, unsigned char state);
+extern double gettime ( void );
+extern int gettimeSrand ( void );
+extern double randum ( long *seed );
+
+extern void getxnode ( nodeptr p );
+extern void hookup ( nodeptr p, nodeptr q, double *z, int numBranches);
+extern void hookupFull ( nodeptr p, nodeptr q, double *z);
+extern void hookupDefault ( nodeptr p, nodeptr q);
+extern pllBoolean whitechar ( int ch );
+extern void printLog ( pllInstance *tr);
+extern double LnGamma ( double alpha );
+extern double IncompleteGamma ( double x, double alpha, double ln_gamma_alpha );
+extern double PointNormal ( double prob );
+extern double PointChi2 ( double prob, double v );
+extern void initModel ( pllInstance *tr, double **empiricalFrequencies, partitionList * partitions);
+
+extern void resetBranches ( pllInstance *tr );
+extern void modOpt ( pllInstance *tr, partitionList *pr, double likelihoodEpsilon);
+
+extern void initializePartitionData(pllInstance *localTree, partitionList * localPartitions);
+extern void initMemorySavingAndRecom(pllInstance *tr, partitionList *pr);
+
+extern void nodeRectifier ( pllInstance *tr );
+extern void allocateParsimonyDataStructures(pllInstance *tr, partitionList *pr);
+
+extern FILE *myfopen(const char *path, const char *mode);
+
+extern pllBoolean initrav ( pllInstance *tr, partitionList *pr, nodeptr p );
+extern void initravPartition ( pllInstance *tr, nodeptr p, int model );
+extern void update ( pllInstance *tr, partitionList *pr, nodeptr p );
+extern void smooth ( pllInstance *tr, partitionList *pr, nodeptr p );
+extern void smoothTree ( pllInstance *tr, partitionList *pr, int maxtimes );
+extern void localSmooth ( pllInstance *tr, partitionList *pr, nodeptr p, int maxtimes );
+extern pllBoolean localSmoothMulti(pllInstance *tr, nodeptr p, int maxtimes, int model);
+
+extern void smoothRegion ( pllInstance *tr, partitionList *pr, nodeptr p, int region );
+extern void regionalSmooth ( pllInstance *tr, partitionList *pr, nodeptr p, int maxtimes, int region );
+extern nodeptr removeNodeBIG ( pllInstance *tr, partitionList *pr, nodeptr p, int numBranches);
+extern nodeptr removeNodeRestoreBIG ( pllInstance *tr, partitionList *pr, nodeptr p );
+extern pllBoolean insertBIG ( pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q);
+extern pllBoolean insertRestoreBIG ( pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q );
+extern pllBoolean testInsertBIG ( pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q );
+extern int NNI(pllInstance * tr, nodeptr p, int swap);
+extern void addTraverseBIG ( pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q, int mintrav, int maxtrav );
+extern int rearrangeBIG ( pllInstance *tr, partitionList *pr, nodeptr p, int mintrav, int maxtrav );
+extern void traversalOrder ( nodeptr p, int *count, nodeptr *nodeArray );
+extern pllBoolean testInsertRestoreBIG ( pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q );
+extern void restoreTreeFast ( pllInstance *tr, partitionList *pr );
+
+extern void initTL ( topolRELL_LIST *rl, pllInstance *tr, int n );
+extern void freeTL ( topolRELL_LIST *rl);
+extern void restoreTL ( topolRELL_LIST *rl, pllInstance *tr, int n, int numBranches );
+extern void resetTL ( topolRELL_LIST *rl );
+extern void saveTL ( topolRELL_LIST *rl, pllInstance *tr, int index );
+
+extern topol  *setupTopol (int maxtips);
+extern void saveTree (pllInstance *tr, topol *tpl, int numBranches);
+extern pllBoolean restoreTree (topol *tpl, pllInstance *tr, partitionList *pr);
+
+
+
+
+extern int  saveBestTree (bestlist *bt, pllInstance *tr, int numBranches);
+extern int  recallBestTree (bestlist *bt, int rank, pllInstance *tr, partitionList *pr);
+extern int initBestTree ( bestlist *bt, int newkeep, int numsp );
+extern void resetBestTree ( bestlist *bt );
+extern pllBoolean freeBestTree ( bestlist *bt );
+
+
+/* extern int treeReadLen (FILE *fp, pllInstance *tr, pllBoolean readBranches, pllBoolean readNodeLabels, pllBoolean topologyOnly);
+extern void getStartingTree (pllInstance *tr); 
+extern void treeReadTopologyString(char *treeString, pllInstance *tr);
+extern double treeLength (pllInstance *tr, int model);*/
+extern double evaluatePartialGeneric (pllInstance *, partitionList *pr, int i, double ki, int _model);
+extern void newviewAncestralIterative(pllInstance *tr, partitionList *pr);
+extern void printAncestralState(nodeptr p, pllBoolean printStates, pllBoolean printProbs, pllInstance *tr, partitionList *pr);
+extern void makenewzGeneric(pllInstance *tr, partitionList * pr, nodeptr p, nodeptr q, double *z0, int maxiter, double *result, pllBoolean mask);
+extern void makenewzGenericDistance(pllInstance *tr, int maxiter, double *z0, double *result, int taxon1, int taxon2);
+extern double evaluatePartitionGeneric (pllInstance *tr, nodeptr p, int model);
+extern void newviewPartitionGeneric (pllInstance *tr, nodeptr p, int model);
+extern double evaluateGenericVector (pllInstance *tr, nodeptr p);
+extern void categorizeGeneric (pllInstance *tr, nodeptr p);
+extern double makenewzPartitionGeneric(pllInstance *tr, nodeptr p, nodeptr q, double z0, int maxiter, int model);
+extern pllBoolean isTip(int number, int maxTips);
+
+/* recom functions */
+extern void computeTraversal(pllInstance *tr, nodeptr p, pllBoolean partialTraversal, int numBranches);
+extern void allocRecompVectorsInfo(pllInstance *tr);
+extern void allocTraversalCounter(pllInstance *tr);
+extern pllBoolean getxVector(recompVectors *rvec, int nodenum, int *slot, int mxtips);
+extern pllBoolean needsRecomp(pllBoolean recompute, recompVectors *rvec, nodeptr p, int mxtips);
+extern void unpinNode(recompVectors *v, int nodenum, int mxtips);
+extern void protectNode(recompVectors *rvec, int nodenum, int mxtips);
+
+/* Handling branch lengths*/
+extern void computeTraversalInfoStlen(nodeptr p, int maxTips, recompVectors *rvec, int *count);
+extern void computeFullTraversalInfoStlen(nodeptr p, int maxTips, recompVectors *rvec);
+extern void printTraversalInfo(pllInstance *tr);
+extern void countTraversal(pllInstance *tr);
+extern void storeExecuteMaskInTraversalDescriptor(pllInstance *tr, partitionList *pr);
+extern void storeValuesInTraversalDescriptor(pllInstance *tr, partitionList *pr, double *value);
+extern void makenewzIterative(pllInstance *, partitionList *pr);
+extern void execCore(pllInstance *, partitionList *pr, volatile double *dlnLdlz, volatile double *d2lnLdlz2);
+extern void makePermutation(int *perm, int n, pllInstance *tr);
+extern nodeptr findAnyTip(nodeptr p, int numsp);
+extern void putWAG(double *ext_initialRates);
+extern  unsigned int **initBitVector(int mxtips, unsigned int *vectorLength);
+//extern hashtable *initHashTable(unsigned int n);
+extern void cleanupHashTable(pllHashTable * h, int state);
+extern double convergenceCriterion(pllHashTable *h, int mxtips);
+extern void freeBitVectors(unsigned int **v, int n);
+//extern void freeHashTable(hashtable *h);
+//extern stringHashtable *initStringHashTable(hashNumberType n);
+//extern void addword(char *s, stringHashtable *h, int nodeNumber);
+extern void initRateMatrix(pllInstance *tr, partitionList *pr);
+extern void bitVectorInitravSpecial(unsigned int **bitVectors, nodeptr p, int numsp, unsigned int vectorLength, pllHashTable *h, int treeNumber, int function, branchInfo *bInf,
+                                    int *countBranches, int treeVectorLength, pllBoolean traverseOnly, pllBoolean computeWRF, int processID);
+extern  unsigned int bitcount_32_bit(unsigned int i);
+extern __inline unsigned int bitcount_64_bit(uint64_t i);
+extern void perSiteLogLikelihoods(pllInstance *tr, partitionList *pr, double *logLikelihoods);
+extern void updatePerSiteRates(pllInstance *tr, partitionList *pr, pllBoolean scaleRates);
+extern void restart(pllInstance *tr, partitionList *pr);
+
+//extern const unsigned int mask32[32];
+
+/** @brief Check whether the position \a pos in bitvector \a x is a gap
+
+    @param x
+      A bitvector represented by unsigned integers
+
+    @param pos
+      Position to check in \a x if it is set (i.e. it is a gap)
+
+    @return
+      Returns the value of the bit vector (\b 1 if set, \b 0 if not)
+*/
+//#ifndef __clang__
+//inline
+//#endif
+pllBoolean isGap(unsigned int *x, int pos);
+
+/** @brief Check whether the position \a pos in bitvector \a x is \b NOT a gap
+
+    @param x
+      A bitvector represented by unsigned integers
+
+    @param pos
+      Position to check in \a x if it is \b NOT set (i.e. it is \b NOT a gap)
+
+    @return
+      Returns \b 1 if the bit is \b not set (i.e. the position is \b not a gap), \b 0 otherwise
+*/
+//#ifndef __clang__
+//inline
+//#endif
+pllBoolean noGap(unsigned int *x, int pos);
+
+//#ifndef __clang__
+//__inline
+//#endif
+//pllBoolean isGap(unsigned int *x, int pos);
+
+//#ifndef __clang__
+//__inline
+//#endif
+//pllBoolean noGap(unsigned int *x, int pos);
+
+/* from utils.h */
+linkageList* initLinkageList(int *linkList, partitionList *pr);
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS) )
+/* work tags for parallel regions */
+
+#define PLL_THREAD_NEWVIEW                  0
+#define PLL_THREAD_EVALUATE                 1
+#define PLL_THREAD_MAKENEWZ                 2
+#define PLL_THREAD_MAKENEWZ_FIRST           3
+#define PLL_THREAD_RATE_CATS                4
+#define PLL_THREAD_COPY_RATE_CATS           5
+#define PLL_THREAD_COPY_INIT_MODEL          6
+#define PLL_THREAD_INIT_PARTITION           7
+#define PLL_THREAD_OPT_ALPHA                8
+#define PLL_THREAD_OPT_RATE                 9
+#define PLL_THREAD_OPT_LG4X_RATE            10
+#define PLL_THREAD_COPY_ALPHA               11
+#define PLL_THREAD_COPY_RATES               12
+#define PLL_THREAD_COPY_LG4X_RATES          13
+#define PLL_THREAD_PER_SITE_LIKELIHOODS     14
+#define PLL_THREAD_NEWVIEW_ANCESTRAL        15
+#define PLL_THREAD_GATHER_ANCESTRAL         16
+#define PLL_THREAD_EXIT_GRACEFULLY          17
+#define PLL_THREAD_EVALUATE_PER_SITE_LIKES  18
+
+
+typedef struct
+{
+  pllInstance *tr;
+
+  partitionList *pr;
+  int threadNumber;
+}
+  threadData;
+extern void optRateCatPthreads(pllInstance *tr, partitionList *pr, double lower_spacing, double upper_spacing, double *lhs, int n, int tid);
+extern void pllMasterBarrier(pllInstance *, partitionList *, int);
+#endif
+
+
+#ifdef __AVX
+
+extern void newviewGTRGAMMAPROT_AVX_LG4(int tipCase,
+                                        double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
+                                        int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n,
+                                        double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+
+extern void newviewGTRCAT_AVX_GAPPED_SAVE(int tipCase,  double *EV,  int *cptr,
+                                   double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                   int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+                                   unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                   double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats);
+
+extern void newviewGTRCATPROT_AVX_GAPPED_SAVE(int tipCase, double *extEV,
+                                       int *cptr,
+                                       double *x1, double *x2, double *x3, double *tipVector,
+                                       int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                       int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+                                       unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                       double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats);
+
+extern void  newviewGTRGAMMA_AVX_GAPPED_SAVE(int tipCase,
+                                      double *x1_start, double *x2_start, double *x3_start,
+                                      double *extEV, double *tipVector,
+                                      int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                      const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+                                      unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                      double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
+                                      );
+
+extern void newviewGTRGAMMAPROT_AVX_GAPPED_SAVE(int tipCase,
+                                         double *x1_start, double *x2_start, double *x3_start, double *extEV, double *tipVector,
+                                         int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n,
+                                         double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+                                         unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                         double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn);
+
+extern void newviewGTRCAT_AVX(int tipCase,  double *EV,  int *cptr,
+    double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+    int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+
+extern void newviewGenericCATPROT_AVX(int tipCase, double *extEV,
+    int *cptr,
+    double *x1, double *x2, double *x3, double *tipVector,
+    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+    int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+
+extern void newviewGTRGAMMA_AVX(int tipCase,
+    double *x1_start, double *x2_start, double *x3_start,
+    double *EV, double *tipVector,
+    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+    const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+extern void newviewGTRGAMMAPROT_AVX(int tipCase,
+                             double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                             int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n,
+                             double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+extern void newviewGTRCATPROT_AVX(int tipCase, double *extEV,
+                           int *cptr,
+                           double *x1, double *x2, double *x3, double *tipVector,
+                           int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                           int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+#endif
+
+extern int virtual_width( int n );
+extern void computeAllAncestralVectors(nodeptr p, pllInstance *tr, partitionList *pr);
+
+#endif /* PLLINTERNAL_H_ */
diff --git a/pll/pthread.h b/pll/pthread.h
new file mode 100644
index 0000000..b4072f7
--- /dev/null
+++ b/pll/pthread.h
@@ -0,0 +1,1368 @@
+/* This is an implementation of the threads API of POSIX 1003.1-2001.
+ *
+ * --------------------------------------------------------------------------
+ *
+ *      Pthreads-win32 - POSIX Threads Library for Win32
+ *      Copyright(C) 1998 John E. Bossom
+ *      Copyright(C) 1999,2005 Pthreads-win32 contributors
+ * 
+ *      Contact Email: rpj at callisto.canberra.edu.au
+ * 
+ *      The current list of contributors is contained
+ *      in the file CONTRIBUTORS included with the source
+ *      code distribution. The list can also be seen at the
+ *      following World Wide Web location:
+ *      http://sources.redhat.com/pthreads-win32/contributors.html
+ * 
+ *      This library is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU Lesser General Public
+ *      License as published by the Free Software Foundation; either
+ *      version 2 of the License, or (at your option) any later version.
+ * 
+ *      This library is distributed in the hope that it will be useful,
+ *      but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *      Lesser General Public License for more details.
+ * 
+ *      You should have received a copy of the GNU Lesser General Public
+ *      License along with this library in the file COPYING.LIB;
+ *      if not, write to the Free Software Foundation, Inc.,
+ *      59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#if !defined( PTHREAD_H )
+#define PTHREAD_H
+
+/*
+ * See the README file for an explanation of the pthreads-win32 version
+ * numbering scheme and how the DLL is named etc.
+ */
+#define PTW32_VERSION 2,9,1,0
+#define PTW32_VERSION_STRING "2, 9, 1, 0\0"
+
+/* There are three implementations of cancel cleanup.
+ * Note that pthread.h is included in both application
+ * compilation units and also internally for the library.
+ * The code here and within the library aims to work
+ * for all reasonable combinations of environments.
+ *
+ * The three implementations are:
+ *
+ *   WIN32 SEH
+ *   C
+ *   C++
+ *
+ * Please note that exiting a push/pop block via
+ * "return", "exit", "break", or "continue" will
+ * lead to different behaviour amongst applications
+ * depending upon whether the library was built
+ * using SEH, C++, or C. For example, a library built
+ * with SEH will call the cleanup routine, while both
+ * C++ and C built versions will not.
+ */
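+
+/* Editorial sketch (illustrative only): because of the differences described
+   above, portable callers should leave a cleanup block normally rather than
+   via return/exit/break/continue; unlock_mutex_cb is a hypothetical handler
+   and m a mutex initialised elsewhere:
+
+     pthread_mutex_lock (&m);
+     pthread_cleanup_push (unlock_mutex_cb, &m);
+     do_work ();                    // hypothetical worker
+     pthread_cleanup_pop (1);       // non-zero: run unlock_mutex_cb here too
+*/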
+
+/*
+ * Define defaults for cleanup code.
+ * Note: Unless the build explicitly defines one of the following, then
+ * we default to standard C style cleanup. This style uses setjmp/longjmp
+ * in the cancelation and thread exit implementations and therefore won't
+ * do stack unwinding if linked to applications that have it (e.g.
+ * C++ apps). This is currently consistent with most/all commercial Unix
+ * POSIX threads implementations.
+ */
+#if !defined( __CLEANUP_SEH ) && !defined( __CLEANUP_CXX ) && !defined( __CLEANUP_C )
+# define __CLEANUP_C
+#endif
+
+#if defined( __CLEANUP_SEH ) && ( !defined( _MSC_VER ) && !defined(PTW32_RC_MSC))
+#error ERROR [__FILE__, line __LINE__]: SEH is not supported for this compiler.
+#endif
+
+/*
+ * Stop here if we are being included by the resource compiler.
+ */
+#if !defined(RC_INVOKED)
+
+#undef PTW32_LEVEL
+
+#if defined(_POSIX_SOURCE)
+#define PTW32_LEVEL 0
+/* Early POSIX */
+#endif
+
+#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309
+#undef PTW32_LEVEL
+#define PTW32_LEVEL 1
+/* Include 1b, 1c and 1d */
+#endif
+
+#if defined(INCLUDE_NP)
+#undef PTW32_LEVEL
+#define PTW32_LEVEL 2
+/* Include Non-Portable extensions */
+#endif
+
+#define PTW32_LEVEL_MAX 3
+
+#if ( defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112 )  || !defined(PTW32_LEVEL)
+#define PTW32_LEVEL PTW32_LEVEL_MAX
+/* Include everything */
+#endif
+
+#if defined(_UWIN)
+#   define HAVE_STRUCT_TIMESPEC 1
+#   define HAVE_SIGNAL_H        1
+#   undef HAVE_PTW32_CONFIG_H
+#   pragma comment(lib, "pthread")
+#endif
+
+/*
+ * -------------------------------------------------------------
+ *
+ *
+ * Module: pthread.h
+ *
+ * Purpose:
+ *      Provides an implementation of PThreads based upon the
+ *      standard:
+ *
+ *              POSIX 1003.1-2001
+ *  and
+ *    The Single Unix Specification version 3
+ *
+ *    (these two are equivalent)
+ *
+ *      in order to enhance code portability between Windows,
+ *  various commercial Unix implementations, and Linux.
+ *
+ *      See the ANNOUNCE file for a full list of conforming
+ *      routines and defined constants, and a list of missing
+ *      routines and constants not defined in this implementation.
+ *
+ * Authors:
+ *      There have been many contributors to this library.
+ *      The initial implementation was contributed by
+ *      John Bossom, and several others have provided major
+ *      sections or revisions of parts of the implementation.
+ *      Often significant effort has been contributed to
+ *      find and fix important bugs and other problems to
+ *      improve the reliability of the library, which sometimes
+ *      is not reflected in the amount of code which changed as
+ *      a result.
+ *      As much as possible, the contributors are acknowledged
+ *      in the ChangeLog file in the source code distribution
+ *      where their changes are noted in detail.
+ *
+ *      Contributors are listed in the CONTRIBUTORS file.
+ *
+ *      As usual, all bouquets go to the contributors, and all
+ *      brickbats go to the project maintainer.
+ *
+ * Maintainer:
+ *      The code base for this project is coordinated and
+ *      eventually pre-tested, packaged, and made available by
+ *
+ *              Ross Johnson <rpj at callisto.canberra.edu.au>
+ *
+ * QA Testers:
+ *      Ultimately, the library is tested in the real world by
+ *      a host of competent and demanding scientists and
+ *      engineers who report bugs and/or provide solutions
+ *      which are then fixed or incorporated into subsequent
+ *      versions of the library. Each time a bug is fixed, a
+ *      test case is written to prove the fix and ensure
+ *      that later changes to the code don't reintroduce the
+ *      same error. The number of test cases is slowly growing
+ *      and therefore so is the code reliability.
+ *
+ * Compliance:
+ *      See the file ANNOUNCE for the list of implemented
+ *      and not-implemented routines and defined options.
+ *      Of course, these are all defined in this file as well.
+ *
+ * Web site:
+ *      The source code and other information about this library
+ *      are available from
+ *
+ *              http://sources.redhat.com/pthreads-win32/
+ *
+ * -------------------------------------------------------------
+ */
+
+/* Try to avoid including windows.h */
+#if (defined(__MINGW64__) || defined(__MINGW32__)) && defined(__cplusplus)
+#define PTW32_INCLUDE_WINDOWS_H
+#endif
+
+#if defined(PTW32_INCLUDE_WINDOWS_H)
+#include <windows.h>
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER < 1300 || defined(__DMC__)
+/*
+ * VC++ 6.0 and earlier compilers' headers have no DWORD_PTR type.
+ */
+typedef unsigned long DWORD_PTR;
+typedef unsigned long ULONG_PTR;
+#endif
+/*
+ * -----------------
+ * autoconf switches
+ * -----------------
+ */
+
+#if defined(HAVE_PTW32_CONFIG_H)
+#include "config.h"
+#endif /* HAVE_PTW32_CONFIG_H */
+
+#if !defined(NEED_FTIME)
+#include <time.h>
+#else /* NEED_FTIME */
+/* use native WIN32 time API */
+#endif /* NEED_FTIME */
+
+#if defined(HAVE_SIGNAL_H)
+#include <signal.h>
+#endif /* HAVE_SIGNAL_H */
+
+#include <limits.h>
+
+/*
+ * Boolean values to make us independent of system includes.
+ */
+enum {
+  PTW32_FALSE = 0,
+  PTW32_TRUE = (! PTW32_FALSE)
+};
+
+/*
+ * This is a duplicate of what is in the autoconf config.h,
+ * which is only used when building the pthread-win32 libraries.
+ */
+
+#if !defined(PTW32_CONFIG_H)
+#  if defined(WINCE)
+#    define NEED_ERRNO
+#    define NEED_SEM
+#  endif
+#  if defined(__MINGW64__)
+#    define HAVE_STRUCT_TIMESPEC
+#    define HAVE_MODE_T
+#  elif defined(_UWIN) || defined(__MINGW32__)
+#    define HAVE_MODE_T
+#  endif
+#endif
+
+/*
+ *
+ */
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX
+#if defined(NEED_ERRNO)
+#include "need_errno.h"
+#else
+#include <errno.h>
+#endif
+#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */
+
+/*
+ * Several systems don't define some error numbers.
+ */
+#if !defined(ENOTSUP)
+#  define ENOTSUP 48   /* This is the value in Solaris. */
+#endif
+
+#if !defined(ETIMEDOUT)
+#  define ETIMEDOUT 10060 /* Same as WSAETIMEDOUT */
+#endif
+
+#if !defined(ENOSYS)
+#  define ENOSYS 140     /* Semi-arbitrary value */
+#endif
+
+#if !defined(EDEADLK)
+#  if defined(EDEADLOCK)
+#    define EDEADLK EDEADLOCK
+#  else
+#    define EDEADLK 36     /* This is the value in MSVC. */
+#  endif
+#endif
+
+/* POSIX 2008 - related to robust mutexes */
+#if !defined(EOWNERDEAD)
+#  define EOWNERDEAD 43
+#endif
+#if !defined(ENOTRECOVERABLE)
+#  define ENOTRECOVERABLE 44
+#endif
+
+#include <sched.h>
+
+/*
+ * To avoid including windows.h we define only those things that we
+ * actually need from it.
+ */
+#if !defined(PTW32_INCLUDE_WINDOWS_H)
+#if !defined(HANDLE)
+# define PTW32__HANDLE_DEF
+# define HANDLE void *
+#endif
+#if !defined(DWORD)
+# define PTW32__DWORD_DEF
+# define DWORD unsigned long
+#endif
+#endif
+
+#if !defined(HAVE_STRUCT_TIMESPEC)
+#define HAVE_STRUCT_TIMESPEC
+#if !defined(_TIMESPEC_DEFINED)
+#define _TIMESPEC_DEFINED
+struct timespec {
+        time_t tv_sec;
+        long tv_nsec;
+};
+#endif /* _TIMESPEC_DEFINED */
+#endif /* HAVE_STRUCT_TIMESPEC */
+
+#if !defined(SIG_BLOCK)
+#define SIG_BLOCK 0
+#endif /* SIG_BLOCK */
+
+#if !defined(SIG_UNBLOCK)
+#define SIG_UNBLOCK 1
+#endif /* SIG_UNBLOCK */
+
+#if !defined(SIG_SETMASK)
+#define SIG_SETMASK 2
+#endif /* SIG_SETMASK */
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif                          /* __cplusplus */
+
+/*
+ * -------------------------------------------------------------
+ *
+ * POSIX 1003.1-2001 Options
+ * =========================
+ *
+ * Options are normally set in <unistd.h>, which is not provided
+ * with pthreads-win32.
+ *
+ * For conformance with the Single Unix Specification (version 3), all of the
+ * options below are defined, and have a value of either -1 (not supported)
+ * or 200112L (supported).
+ *
+ * These options can neither be left undefined nor have a value of 0, because
+ * either indicates that sysconf(), which is not implemented, may be used at
+ * runtime to check the status of the option.
+ *
+ * _POSIX_THREADS (== 200112L)
+ *                      If == 200112L, you can use threads
+ *
+ * _POSIX_THREAD_ATTR_STACKSIZE (== 200112L)
+ *                      If == 200112L, you can control the size of a thread's
+ *                      stack
+ *                              pthread_attr_getstacksize
+ *                              pthread_attr_setstacksize
+ *
+ * _POSIX_THREAD_ATTR_STACKADDR (== -1)
+ *                      If == 200112L, you can allocate and control a thread's
+ *                      stack. If not supported, the following functions
+ *                      will return ENOSYS, indicating they are not
+ *                      supported:
+ *                              pthread_attr_getstackaddr
+ *                              pthread_attr_setstackaddr
+ *
+ * _POSIX_THREAD_PRIORITY_SCHEDULING (== -1)
+ *                      If == 200112L, you can use realtime scheduling.
+ *                      This option indicates that the behaviour of some
+ *                      implemented functions conforms to the additional TPS
+ *                      requirements in the standard. E.g. rwlocks favour
+ *                      writers over readers when threads have equal priority.
+ *
+ * _POSIX_THREAD_PRIO_INHERIT (== -1)
+ *                      If == 200112L, you can create priority inheritance
+ *                      mutexes.
+ *                              pthread_mutexattr_getprotocol +
+ *                              pthread_mutexattr_setprotocol +
+ *
+ * _POSIX_THREAD_PRIO_PROTECT (== -1)
+ *                      If == 200112L, you can create priority ceiling mutexes
+ *                      Indicates the availability of:
+ *                              pthread_mutex_getprioceiling
+ *                              pthread_mutex_setprioceiling
+ *                              pthread_mutexattr_getprioceiling
+ *                              pthread_mutexattr_getprotocol     +
+ *                              pthread_mutexattr_setprioceiling
+ *                              pthread_mutexattr_setprotocol     +
+ *
+ * _POSIX_THREAD_PROCESS_SHARED (== -1)
+ *                      If set, you can create mutexes and condition
+ *                      variables that can be shared with another
+ *                      process. If set, indicates the availability
+ *                      of:
+ *                              pthread_mutexattr_getpshared
+ *                              pthread_mutexattr_setpshared
+ *                              pthread_condattr_getpshared
+ *                              pthread_condattr_setpshared
+ *
+ * _POSIX_THREAD_SAFE_FUNCTIONS (== 200112L)
+ *                      If == 200112L you can use the special *_r library
+ *                      functions that provide thread-safe behaviour
+ *
+ * _POSIX_READER_WRITER_LOCKS (== 200112L)
+ *                      If == 200112L, you can use read/write locks
+ *
+ * _POSIX_SPIN_LOCKS (== 200112L)
+ *                      If == 200112L, you can use spin locks
+ *
+ * _POSIX_BARRIERS (== 200112L)
+ *                      If == 200112L, you can use barriers
+ *
+ *      + These functions provide the 'inherit' and/or
+ *        'protect' protocol, depending upon these macro
+ *        settings.
+ *
+ * -------------------------------------------------------------
+ */
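+
+/* Editorial sketch (illustrative only): applications can gate optional code
+   on these macros at compile time, e.g.
+
+     #if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS >= 200112L
+       pthread_rwlock_t cache_lock;   // hypothetical shared lock
+     #endif
+*/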
+
+/*
+ * POSIX Options
+ */
+#undef _POSIX_THREADS
+#define _POSIX_THREADS 200809L
+
+#undef _POSIX_READER_WRITER_LOCKS
+#define _POSIX_READER_WRITER_LOCKS 200809L
+
+#undef _POSIX_SPIN_LOCKS
+#define _POSIX_SPIN_LOCKS 200809L
+
+#undef _POSIX_BARRIERS
+#define _POSIX_BARRIERS 200809L
+
+#undef _POSIX_THREAD_SAFE_FUNCTIONS
+#define _POSIX_THREAD_SAFE_FUNCTIONS 200809L
+
+#undef _POSIX_THREAD_ATTR_STACKSIZE
+#define _POSIX_THREAD_ATTR_STACKSIZE 200809L
+
+/*
+ * The following options are not supported
+ */
+#undef _POSIX_THREAD_ATTR_STACKADDR
+#define _POSIX_THREAD_ATTR_STACKADDR -1
+
+#undef _POSIX_THREAD_PRIO_INHERIT
+#define _POSIX_THREAD_PRIO_INHERIT -1
+
+#undef _POSIX_THREAD_PRIO_PROTECT
+#define _POSIX_THREAD_PRIO_PROTECT -1
+
+/* TPS is not fully supported.  */
+#undef _POSIX_THREAD_PRIORITY_SCHEDULING
+#define _POSIX_THREAD_PRIORITY_SCHEDULING -1
+
+#undef _POSIX_THREAD_PROCESS_SHARED
+#define _POSIX_THREAD_PROCESS_SHARED -1
+
+
+/*
+ * POSIX 1003.1-2001 Limits
+ * ===========================
+ *
+ * These limits are normally set in <limits.h>, which is not provided with
+ * pthreads-win32.
+ *
+ * PTHREAD_DESTRUCTOR_ITERATIONS
+ *                      Maximum number of attempts to destroy
+ *                      a thread's thread-specific data on
+ *                      termination (must be at least 4)
+ *
+ * PTHREAD_KEYS_MAX
+ *                      Maximum number of thread-specific data keys
+ *                      available per process (must be at least 128)
+ *
+ * PTHREAD_STACK_MIN
+ *                      Minimum supported stack size for a thread
+ *
+ * PTHREAD_THREADS_MAX
+ *                      Maximum number of threads supported per
+ *                      process (must be at least 64).
+ *
+ * SEM_NSEMS_MAX
+ *                      The maximum number of semaphores a process can have.
+ *                      (must be at least 256)
+ *
+ * SEM_VALUE_MAX
+ *                      The maximum value a semaphore can have.
+ *                      (must be at least 32767)
+ *
+ */
+#undef _POSIX_THREAD_DESTRUCTOR_ITERATIONS
+#define _POSIX_THREAD_DESTRUCTOR_ITERATIONS     4
+
+#undef PTHREAD_DESTRUCTOR_ITERATIONS
+#define PTHREAD_DESTRUCTOR_ITERATIONS           _POSIX_THREAD_DESTRUCTOR_ITERATIONS
+
+#undef _POSIX_THREAD_KEYS_MAX
+#define _POSIX_THREAD_KEYS_MAX                  128
+
+#undef PTHREAD_KEYS_MAX
+#define PTHREAD_KEYS_MAX                        _POSIX_THREAD_KEYS_MAX
+
+#undef PTHREAD_STACK_MIN
+#define PTHREAD_STACK_MIN                       0
+
+#undef _POSIX_THREAD_THREADS_MAX
+#define _POSIX_THREAD_THREADS_MAX               64
+
+  /* Arbitrary value */
+#undef PTHREAD_THREADS_MAX
+#define PTHREAD_THREADS_MAX                     2019
+
+#undef _POSIX_SEM_NSEMS_MAX
+#define _POSIX_SEM_NSEMS_MAX                    256
+
+  /* Arbitrary value */
+#undef SEM_NSEMS_MAX
+#define SEM_NSEMS_MAX                           1024
+
+#undef _POSIX_SEM_VALUE_MAX
+#define _POSIX_SEM_VALUE_MAX                    32767
+
+#undef SEM_VALUE_MAX
+#define SEM_VALUE_MAX                           INT_MAX
+
+
+#if defined(__GNUC__) && !defined(__declspec)
+# error Please upgrade your GNU compiler to one that supports __declspec.
+#endif
+
+/*
+ * When building the library, you should define PTW32_BUILD so that
+ * the variables/functions are exported correctly. When using the library,
+ * do NOT define PTW32_BUILD, and then the variables/functions will
+ * be imported correctly.
+ */
+#if !defined(PTW32_STATIC_LIB)
+#  if defined(PTW32_BUILD)
+#    define PTW32_DLLPORT __declspec (dllexport)
+#  else
+#    define PTW32_DLLPORT __declspec (dllimport)
+#  endif
+#else
+#  define PTW32_DLLPORT
+#endif
+
+/*
+ * The Open Watcom C/C++ compiler uses a non-standard calling convention
+ * that passes function args in registers unless __cdecl is explicitly specified
+ * in exposed function prototypes.
+ *
+ * We force all calls to cdecl even though this could slow Watcom code down
+ * slightly. If you know that the Watcom compiler will be used to build both
+ * the DLL and application, then you can probably define this as a null string.
+ * Remember that pthread.h (this file) is used for both the DLL and application builds.
+ */
+#define PTW32_CDECL __cdecl
+
+#if defined(_UWIN) && PTW32_LEVEL >= PTW32_LEVEL_MAX
+#   include     <sys/types.h>
+#else
+/*
+ * Generic handle type - intended to extend uniqueness beyond
+ * that available with a simple pointer. It should scale for either
+ * IA-32 or IA-64.
+ */
+typedef struct {
+    void * p;                   /* Pointer to actual object */
+    unsigned int x;             /* Extra information - reuse count etc */
+} ptw32_handle_t;
+
+typedef ptw32_handle_t pthread_t;
+typedef struct pthread_attr_t_ * pthread_attr_t;
+typedef struct pthread_once_t_ pthread_once_t;
+typedef struct pthread_key_t_ * pthread_key_t;
+typedef struct pthread_mutex_t_ * pthread_mutex_t;
+typedef struct pthread_mutexattr_t_ * pthread_mutexattr_t;
+typedef struct pthread_cond_t_ * pthread_cond_t;
+typedef struct pthread_condattr_t_ * pthread_condattr_t;
+#endif
+typedef struct pthread_rwlock_t_ * pthread_rwlock_t;
+typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t;
+typedef struct pthread_spinlock_t_ * pthread_spinlock_t;
+typedef struct pthread_barrier_t_ * pthread_barrier_t;
+typedef struct pthread_barrierattr_t_ * pthread_barrierattr_t;
+
+/*
+ * ====================
+ * ====================
+ * POSIX Threads
+ * ====================
+ * ====================
+ */
+
+enum {
+/*
+ * pthread_attr_{get,set}detachstate
+ */
+  PTHREAD_CREATE_JOINABLE       = 0,  /* Default */
+  PTHREAD_CREATE_DETACHED       = 1,
+
+/*
+ * pthread_attr_{get,set}inheritsched
+ */
+  PTHREAD_INHERIT_SCHED         = 0,
+  PTHREAD_EXPLICIT_SCHED        = 1,  /* Default */
+
+/*
+ * pthread_{get,set}scope
+ */
+  PTHREAD_SCOPE_PROCESS         = 0,
+  PTHREAD_SCOPE_SYSTEM          = 1,  /* Default */
+
+/*
+ * pthread_setcancelstate parameters
+ */
+  PTHREAD_CANCEL_ENABLE         = 0,  /* Default */
+  PTHREAD_CANCEL_DISABLE        = 1,
+
+/*
+ * pthread_setcanceltype parameters
+ */
+  PTHREAD_CANCEL_ASYNCHRONOUS   = 0,
+  PTHREAD_CANCEL_DEFERRED       = 1,  /* Default */
+
+/*
+ * pthread_mutexattr_{get,set}pshared
+ * pthread_condattr_{get,set}pshared
+ */
+  PTHREAD_PROCESS_PRIVATE       = 0,
+  PTHREAD_PROCESS_SHARED        = 1,
+
+/*
+ * pthread_mutexattr_{get,set}robust
+ */
+  PTHREAD_MUTEX_STALLED         = 0,  /* Default */
+  PTHREAD_MUTEX_ROBUST          = 1,
+
+/*
+ * pthread_barrier_wait
+ */
+  PTHREAD_BARRIER_SERIAL_THREAD = -1
+};
+
+/*
+ * ====================
+ * ====================
+ * Cancelation
+ * ====================
+ * ====================
+ */
+#define PTHREAD_CANCELED       ((void *)(size_t) -1)
+
+
+/*
+ * ====================
+ * ====================
+ * Once Key
+ * ====================
+ * ====================
+ */
+#define PTHREAD_ONCE_INIT       { PTW32_FALSE, 0, 0, 0}
+
+struct pthread_once_t_
+{
+  int          done;        /* indicates if user function has been executed */
+  void *       lock;
+  int          reserved1;
+  int          reserved2;
+};
+
+
+/*
+ * ====================
+ * ====================
+ * Object initialisers
+ * ====================
+ * ====================
+ */
+#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t)(size_t) -1)
+#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER ((pthread_mutex_t)(size_t) -2)
+#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER ((pthread_mutex_t)(size_t) -3)
+
+/*
+ * Compatibility with LinuxThreads
+ */
+#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP PTHREAD_RECURSIVE_MUTEX_INITIALIZER
+#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP PTHREAD_ERRORCHECK_MUTEX_INITIALIZER
+
+#define PTHREAD_COND_INITIALIZER ((pthread_cond_t)(size_t) -1)
+
+#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1)
+
+#define PTHREAD_SPINLOCK_INITIALIZER ((pthread_spinlock_t)(size_t) -1)
+
+
+/*
+ * Mutex types.
+ */
+enum
+{
+  /* Compatibility with LinuxThreads */
+  PTHREAD_MUTEX_FAST_NP,
+  PTHREAD_MUTEX_RECURSIVE_NP,
+  PTHREAD_MUTEX_ERRORCHECK_NP,
+  PTHREAD_MUTEX_TIMED_NP = PTHREAD_MUTEX_FAST_NP,
+  PTHREAD_MUTEX_ADAPTIVE_NP = PTHREAD_MUTEX_FAST_NP,
+  /* For compatibility with POSIX */
+  PTHREAD_MUTEX_NORMAL = PTHREAD_MUTEX_FAST_NP,
+  PTHREAD_MUTEX_RECURSIVE = PTHREAD_MUTEX_RECURSIVE_NP,
+  PTHREAD_MUTEX_ERRORCHECK = PTHREAD_MUTEX_ERRORCHECK_NP,
+  PTHREAD_MUTEX_DEFAULT = PTHREAD_MUTEX_NORMAL
+};
+
+
+typedef struct ptw32_cleanup_t ptw32_cleanup_t;
+
+#if defined(_MSC_VER)
+/* Disable MSVC 'anachronism used' warning */
+#pragma warning( disable : 4229 )
+#endif
+
+typedef void (* PTW32_CDECL ptw32_cleanup_callback_t)(void *);
+
+#if defined(_MSC_VER)
+#pragma warning( default : 4229 )
+#endif
+
+struct ptw32_cleanup_t
+{
+  ptw32_cleanup_callback_t routine;
+  void *arg;
+  struct ptw32_cleanup_t *prev;
+};
+
+#if defined(__CLEANUP_SEH)
+        /*
+         * WIN32 SEH version of cancel cleanup.
+         */
+
+#define pthread_cleanup_push( _rout, _arg ) \
+        { \
+            ptw32_cleanup_t     _cleanup; \
+            \
+        _cleanup.routine        = (ptw32_cleanup_callback_t)(_rout); \
+            _cleanup.arg        = (_arg); \
+            __try \
+              { \
+
+#define pthread_cleanup_pop( _execute ) \
+              } \
+            __finally \
+                { \
+                    if( _execute || AbnormalTermination()) \
+                      { \
+                          (*(_cleanup.routine))( _cleanup.arg ); \
+                      } \
+                } \
+        }
+
+#else /* __CLEANUP_SEH */
+
+#if defined(__CLEANUP_C)
+
+        /*
+         * C implementation of PThreads cancel cleanup
+         */
+
+#define pthread_cleanup_push( _rout, _arg ) \
+        { \
+            ptw32_cleanup_t     _cleanup; \
+            \
+            ptw32_push_cleanup( &_cleanup, (ptw32_cleanup_callback_t) (_rout), (_arg) ); \
+
+#define pthread_cleanup_pop( _execute ) \
+            (void) ptw32_pop_cleanup( _execute ); \
+        }
+
+#else /* __CLEANUP_C */
+
+#if defined(__CLEANUP_CXX)
+
+        /*
+         * C++ version of cancel cleanup.
+         * - John E. Bossom.
+         */
+
+        class PThreadCleanup {
+          /*
+           * PThreadCleanup
+           *
+           * Purpose
+           *      This class is a C++ helper class that is
+           *      used to implement pthread_cleanup_push/
+           *      pthread_cleanup_pop.
+           *      The destructor of this class automatically
+           *      pops the pushed cleanup routine regardless
+           *      of how the code exits the scope
+           *      (e.g. by an exception)
+           */
+      ptw32_cleanup_callback_t cleanUpRout;
+          void    *       obj;
+          int             executeIt;
+
+        public:
+          PThreadCleanup() :
+            cleanUpRout( 0 ),
+            obj( 0 ),
+            executeIt( 0 )
+            /*
+             * No cleanup performed
+             */
+            {
+            }
+
+          PThreadCleanup(
+             ptw32_cleanup_callback_t routine,
+                         void    *       arg ) :
+            cleanUpRout( routine ),
+            obj( arg ),
+            executeIt( 1 )
+            /*
+             * Registers a cleanup routine for 'arg'
+             */
+            {
+            }
+
+          ~PThreadCleanup()
+            {
+              if ( executeIt && ((void *) cleanUpRout != (void *) 0) )
+                {
+                  (void) (*cleanUpRout)( obj );
+                }
+            }
+
+          void execute( int exec )
+            {
+              executeIt = exec;
+            }
+        };
+
+        /*
+         * C++ implementation of PThreads cancel cleanup;
+         * This implementation takes advantage of a helper
+         * class whose destructor automatically calls the
+         * cleanup routine if the scope is exited abnormally
+         */
+#define pthread_cleanup_push( _rout, _arg ) \
+        { \
+            PThreadCleanup  cleanup((ptw32_cleanup_callback_t)(_rout), \
+                                    (void *) (_arg) );
+
+#define pthread_cleanup_pop( _execute ) \
+            cleanup.execute( _execute ); \
+        }
+
+#else
+
+#error ERROR [__FILE__, line __LINE__]: Cleanup type undefined.
+
+#endif /* __CLEANUP_CXX */
+
+#endif /* __CLEANUP_C */
+
+#endif /* __CLEANUP_SEH */
+
+/*
+ * ===============
+ * ===============
+ * Methods
+ * ===============
+ * ===============
+ */
+
+/*
+ * PThread Attribute Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_init (pthread_attr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_destroy (pthread_attr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getdetachstate (const pthread_attr_t * attr,
+                                         int *detachstate);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getstackaddr (const pthread_attr_t * attr,
+                                       void **stackaddr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getstacksize (const pthread_attr_t * attr,
+                                       size_t * stacksize);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setdetachstate (pthread_attr_t * attr,
+                                         int detachstate);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setstackaddr (pthread_attr_t * attr,
+                                       void *stackaddr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setstacksize (pthread_attr_t * attr,
+                                       size_t stacksize);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getschedparam (const pthread_attr_t *attr,
+                                        struct sched_param *param);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setschedparam (pthread_attr_t *attr,
+                                        const struct sched_param *param);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setschedpolicy (pthread_attr_t *,
+                                         int);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getschedpolicy (const pthread_attr_t *,
+                                         int *);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setinheritsched(pthread_attr_t * attr,
+                                         int inheritsched);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getinheritsched(const pthread_attr_t * attr,
+                                         int * inheritsched);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setscope (pthread_attr_t *,
+                                   int);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getscope (const pthread_attr_t *,
+                                   int *);
+
+/*
+ * PThread Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_create (pthread_t * tid,
+                            const pthread_attr_t * attr,
+                            void *(PTW32_CDECL *start) (void *),
+                            void *arg);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_detach (pthread_t tid);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_equal (pthread_t t1,
+                           pthread_t t2);
+
+PTW32_DLLPORT void PTW32_CDECL pthread_exit (void *value_ptr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_join (pthread_t thread,
+                          void **value_ptr);
+
+PTW32_DLLPORT pthread_t PTW32_CDECL pthread_self (void);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cancel (pthread_t thread);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_setcancelstate (int state,
+                                    int *oldstate);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_setcanceltype (int type,
+                                   int *oldtype);
+
+PTW32_DLLPORT void PTW32_CDECL pthread_testcancel (void);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_once (pthread_once_t * once_control,
+                          void (PTW32_CDECL *init_routine) (void));
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX
+PTW32_DLLPORT ptw32_cleanup_t * PTW32_CDECL ptw32_pop_cleanup (int execute);
+
+PTW32_DLLPORT void PTW32_CDECL ptw32_push_cleanup (ptw32_cleanup_t * cleanup,
+                                 ptw32_cleanup_callback_t routine,
+                                 void *arg);
+#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */
+
+/*
+ * Thread Specific Data Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_key_create (pthread_key_t * key,
+                                void (PTW32_CDECL *destructor) (void *));
+
+PTW32_DLLPORT int PTW32_CDECL pthread_key_delete (pthread_key_t key);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_setspecific (pthread_key_t key,
+                                 const void *value);
+
+PTW32_DLLPORT void * PTW32_CDECL pthread_getspecific (pthread_key_t key);
+
+
+/*
+ * Mutex Attribute Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_init (pthread_mutexattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_destroy (pthread_mutexattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getpshared (const pthread_mutexattr_t
+                                          * attr,
+                                          int *pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setpshared (pthread_mutexattr_t * attr,
+                                          int pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_settype (pthread_mutexattr_t * attr, int kind);
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_gettype (const pthread_mutexattr_t * attr, int *kind);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setrobust(
+                                           pthread_mutexattr_t *attr,
+                                           int robust);
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getrobust(
+                                           const pthread_mutexattr_t * attr,
+                                           int * robust);
+
+/*
+ * Barrier Attribute Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_init (pthread_barrierattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_destroy (pthread_barrierattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_getpshared (const pthread_barrierattr_t
+                                            * attr,
+                                            int *pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_setpshared (pthread_barrierattr_t * attr,
+                                            int pshared);
+
+/*
+ * Mutex Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_init (pthread_mutex_t * mutex,
+                                const pthread_mutexattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_destroy (pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_lock (pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_timedlock(pthread_mutex_t * mutex,
+                                    const struct timespec *abstime);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_trylock (pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_unlock (pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_consistent (pthread_mutex_t * mutex);
+
+/*
+ * Spinlock Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_init (pthread_spinlock_t * lock, int pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_destroy (pthread_spinlock_t * lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_lock (pthread_spinlock_t * lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_trylock (pthread_spinlock_t * lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_unlock (pthread_spinlock_t * lock);
+
+/*
+ * Barrier Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_barrier_init (pthread_barrier_t * barrier,
+                                  const pthread_barrierattr_t * attr,
+                                  unsigned int count);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrier_destroy (pthread_barrier_t * barrier);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrier_wait (pthread_barrier_t * barrier);
+
+/*
+ * Condition Variable Attribute Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_condattr_init (pthread_condattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_condattr_destroy (pthread_condattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_condattr_getpshared (const pthread_condattr_t * attr,
+                                         int *pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_condattr_setpshared (pthread_condattr_t * attr,
+                                         int pshared);
+
+/*
+ * Condition Variable Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_init (pthread_cond_t * cond,
+                               const pthread_condattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_destroy (pthread_cond_t * cond);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_wait (pthread_cond_t * cond,
+                               pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_timedwait (pthread_cond_t * cond,
+                                    pthread_mutex_t * mutex,
+                                    const struct timespec *abstime);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_signal (pthread_cond_t * cond);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_broadcast (pthread_cond_t * cond);
+
+/*
+ * Scheduling
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_setschedparam (pthread_t thread,
+                                   int policy,
+                                   const struct sched_param *param);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_getschedparam (pthread_t thread,
+                                   int *policy,
+                                   struct sched_param *param);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_setconcurrency (int);
+ 
+PTW32_DLLPORT int PTW32_CDECL pthread_getconcurrency (void);
+
+/*
+ * Read-Write Lock Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_init(pthread_rwlock_t *lock,
+                                const pthread_rwlockattr_t *attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_destroy(pthread_rwlock_t *lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_tryrdlock(pthread_rwlock_t *);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_trywrlock(pthread_rwlock_t *);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_rdlock(pthread_rwlock_t *lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_timedrdlock(pthread_rwlock_t *lock,
+                                       const struct timespec *abstime);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_wrlock(pthread_rwlock_t *lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_timedwrlock(pthread_rwlock_t *lock,
+                                       const struct timespec *abstime);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_unlock(pthread_rwlock_t *lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_init (pthread_rwlockattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_destroy (pthread_rwlockattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_getpshared (const pthread_rwlockattr_t * attr,
+                                           int *pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_setpshared (pthread_rwlockattr_t * attr,
+                                           int pshared);
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX - 1
+
+/*
+ * Signal Functions. Should be defined in <signal.h> but MSVC and MinGW32
+ * already provide a signal.h that does not define these.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_kill(pthread_t thread, int sig);
+
+/*
+ * Non-portable functions
+ */
+
+/*
+ * Compatibility with Linux.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setkind_np(pthread_mutexattr_t * attr,
+                                         int kind);
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getkind_np(pthread_mutexattr_t * attr,
+                                         int *kind);
+
+/*
+ * Possibly supported by other POSIX threads implementations
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_delay_np (struct timespec * interval);
+PTW32_DLLPORT int PTW32_CDECL pthread_num_processors_np(void);
+PTW32_DLLPORT unsigned __int64 PTW32_CDECL pthread_getunique_np(pthread_t thread);
+
+/*
+ * Useful if an application wants to statically link
+ * the lib rather than load the DLL at run-time.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_process_attach_np(void);
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_process_detach_np(void);
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_thread_attach_np(void);
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_thread_detach_np(void);
+
+/*
+ * Features that are auto-detected at load/run time.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_test_features_np(int);
+enum ptw32_features {
+  PTW32_SYSTEM_INTERLOCKED_COMPARE_EXCHANGE = 0x0001, /* System provides it. */
+  PTW32_ALERTABLE_ASYNC_CANCEL              = 0x0002  /* Can cancel blocked threads. */
+};
+
+/*
+ * Register a system time change with the library.
+ * Causes the library to perform various functions
+ * in response to the change. Should be called whenever
+ * the application's top level window receives a
+ * WM_TIMECHANGE message. It can be passed directly to
+ * pthread_create() as a new thread if desired.
+ */
+PTW32_DLLPORT void * PTW32_CDECL pthread_timechange_handler_np(void *);
+
+#endif /*PTW32_LEVEL >= PTW32_LEVEL_MAX - 1 */
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX
+
+/*
+ * Returns the Win32 HANDLE for the POSIX thread.
+ */
+PTW32_DLLPORT HANDLE PTW32_CDECL pthread_getw32threadhandle_np(pthread_t thread);
+/*
+ * Returns the win32 thread ID for POSIX thread.
+ */
+PTW32_DLLPORT DWORD PTW32_CDECL pthread_getw32threadid_np (pthread_t thread);
+
+
+/*
+ * Protected Methods
+ *
+ * This function blocks until the given WIN32 handle
+ * is signaled or pthread_cancel has been called.
+ * This function allows the caller to hook into the
+ * PThreads cancel mechanism. It is implemented using
+ *
+ *              WaitForMultipleObjects
+ *
+ * on 'waitHandle' and a manually reset WIN32 Event
+ * used to implement pthread_cancel. The 'timeout'
+ * argument to TimedWait is simply passed to
+ * WaitForMultipleObjects.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthreadCancelableWait (HANDLE waitHandle);
+PTW32_DLLPORT int PTW32_CDECL pthreadCancelableTimedWait (HANDLE waitHandle,
+                                        DWORD timeout);
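+
+/*
+ * Illustrative sketch (not part of this header; the event handle and return
+ * handling are hypothetical): a worker that waits on a native Win32 event
+ * while remaining interruptible by pthread_cancel():
+ *
+ *     void *worker (void *arg)
+ *     {
+ *       HANDLE ev = (HANDLE) arg;
+ *       (void) pthreadCancelableWait (ev);  // returns when ev is signaled;
+ *                                           // a pending cancel unwinds here
+ *       return NULL;
+ *     }
+ */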
+
+#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */
+
+/*
+ * Thread-Safe C Runtime Library Mappings.
+ */
+#if !defined(_UWIN)
+#  if defined(NEED_ERRNO)
+     PTW32_DLLPORT int * PTW32_CDECL _errno( void );
+#  else
+#    if !defined(errno)
+#      if (defined(_MT) || defined(_DLL))
+         __declspec(dllimport) extern int * __cdecl _errno(void);
+#        define errno   (*_errno())
+#      endif
+#    endif
+#  endif
+#endif
+
+/*
+ * Some compiler environments don't define the Microsoft-style names used below.
+ */
+#if defined(__BORLANDC__)
+#  define _ftime ftime
+#  define _timeb timeb
+#endif
+
+#if defined(__cplusplus)
+
+/*
+ * Internal exceptions
+ */
+class ptw32_exception {};
+class ptw32_exception_cancel : public ptw32_exception {};
+class ptw32_exception_exit   : public ptw32_exception {};
+
+#endif
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX
+
+/* FIXME: This is only required if the library was built using SEH */
+/*
+ * Get internal SEH tag
+ */
+PTW32_DLLPORT DWORD PTW32_CDECL ptw32_get_exception_services_code(void);
+
+#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */
+
+#if !defined(PTW32_BUILD)
+
+#if defined(__CLEANUP_SEH)
+
+/*
+ * Redefine the SEH __except keyword to ensure that applications
+ * propagate our internal exceptions up to the library's internal handlers.
+ */
+#define __except( E ) \
+        __except( ( GetExceptionCode() == ptw32_get_exception_services_code() ) \
+                 ? EXCEPTION_CONTINUE_SEARCH : ( E ) )
+
+#endif /* __CLEANUP_SEH */
+
+#if defined(__CLEANUP_CXX)
+
+/*
+ * Redefine the C++ catch keyword to ensure that applications
+ * propagate our internal exceptions up to the library's internal handlers.
+ */
+#if defined(_MSC_VER)
+        /*
+         * WARNING: Replace any 'catch( ... )' with 'PtW32CatchAll'
+         * if you want Pthread-Win32 cancelation and pthread_exit to work.
+         */
+
+#if !defined(PtW32NoCatchWarn)
+
+#pragma message("Specify \"/DPtW32NoCatchWarn\" compiler flag to skip this message.")
+#pragma message("------------------------------------------------------------------")
+#pragma message("When compiling applications with MSVC++ and C++ exception handling:")
+#pragma message("  Replace any 'catch( ... )' in routines called from POSIX threads")
+#pragma message("  with 'PtW32CatchAll' or 'CATCHALL' if you want POSIX thread")
+#pragma message("  cancelation and pthread_exit to work. For example:")
+#pragma message("")
+#pragma message("    #if defined(PtW32CatchAll)")
+#pragma message("      PtW32CatchAll")
+#pragma message("    #else")
+#pragma message("      catch(...)")
+#pragma message("    #endif")
+#pragma message("        {")
+#pragma message("          /* Catchall block processing */")
+#pragma message("        }")
+#pragma message("------------------------------------------------------------------")
+
+#endif
+
+#define PtW32CatchAll \
+        catch( ptw32_exception & ) { throw; } \
+        catch( ... )
+
+#else /* _MSC_VER */
+
+#define catch( E ) \
+        catch( ptw32_exception & ) { throw; } \
+        catch( E )
+
+#endif /* _MSC_VER */
+
+#endif /* __CLEANUP_CXX */
+
+#endif /* ! PTW32_BUILD */
+
+#if defined(__cplusplus)
+}                               /* End of extern "C" */
+#endif                          /* __cplusplus */
+
+#if defined(PTW32__HANDLE_DEF)
+# undef HANDLE
+#endif
+#if defined(PTW32__DWORD_DEF)
+# undef DWORD
+#endif
+
+#undef PTW32_LEVEL
+#undef PTW32_LEVEL_MAX
+
+#endif /* ! RC_INVOKED */
+
+#endif /* PTHREAD_H */
diff --git a/pll/queue.c b/pll/queue.c
new file mode 100644
index 0000000..eecf3fb
--- /dev/null
+++ b/pll/queue.c
@@ -0,0 +1,96 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file queue.c
+ */
+#include <stdio.h>
+#include "queue.h"
+#include "mem_alloc.h"
+
+int
+pllQueueInit (pllQueue ** q)
+{  
+  *q = (pllQueue *) rax_malloc (sizeof (pllQueue));
+  if (!*q) return (0);
+   
+  (*q)->head = NULL;
+  (*q)->tail = NULL;
+   
+  return (1);
+}  
+
+int 
+pllQueueSize (pllQueue * q)
+{  
+  int n = 0;
+  struct pllQueueItem * elm;
+   
+  if (!q) return (0);
+   
+  for (elm = q->head; elm; elm = elm->next) ++n;
+   
+  return (n);
+}  
+
+int
+pllQueueRemove (pllQueue * q, void ** item)
+{  
+  struct pllQueueItem * elm;
+   
+  if (!q || !q->head) return (0);
+   
+  elm = q->head;
+   
+  *item = elm->item;
+   
+  q->head = q->head->next;
+  if (!q->head)  q->tail = NULL;
+  rax_free (elm);
+   
+  return (1);
+}  
+
+int 
+pllQueueAppend (pllQueue * q, void * item)
+{ 
+  struct pllQueueItem * qitem;
+  if (!q) return (0);
+  
+  qitem = (struct pllQueueItem *) rax_malloc (sizeof (struct pllQueueItem));
+  if (!qitem) return (0);
+  
+  qitem->item = item;
+  qitem->next = NULL;
+  
+  if (!q->head) 
+    q->head = qitem;
+  else
+    q->tail->next = qitem;
+  
+  q->tail = qitem;
+
+  return (1);
+} 
diff --git a/pll/queue.h b/pll/queue.h
new file mode 100644
index 0000000..b359c4a
--- /dev/null
+++ b/pll/queue.h
@@ -0,0 +1,48 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file queue.h
+ */
+#ifndef __pll_QUEUE__
+#define __pll_QUEUE__
+
+struct pllQueueItem
+{  
+  void * item;
+  struct pllQueueItem * next;
+}; 
+   
+typedef struct
+{  
+  struct pllQueueItem * head;
+  struct pllQueueItem * tail;
+} pllQueue; 
+
+int pllQueueInit (pllQueue ** q);
+int pllQueueSize (pllQueue * q);
+int pllQueueRemove (pllQueue * q, void ** item);
+int pllQueueAppend (pllQueue * q, void * item);
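+
+/* Illustrative usage sketch (not part of the library), assuming the queue is
+ * released with rax_free() from mem_alloc.h since pllQueueInit() allocates it
+ * with rax_malloc():
+ *
+ *     pllQueue * q;
+ *     int a = 1, b = 2;
+ *     void * out;
+ *
+ *     if (pllQueueInit (&q))
+ *       {
+ *         pllQueueAppend (q, &a);
+ *         pllQueueAppend (q, &b);
+ *         while (pllQueueRemove (q, &out))
+ *           printf ("%d\n", *(int *) out);   // prints 1, then 2 (FIFO order)
+ *         rax_free (q);
+ *       }
+ */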
+#endif
diff --git a/pll/randomTree.c b/pll/randomTree.c
new file mode 100644
index 0000000..c1d9af4
--- /dev/null
+++ b/pll/randomTree.c
@@ -0,0 +1,177 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file randomTree.c
+ */
+#include "mem_alloc.h"
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+static void insertTaxon (nodeptr p, nodeptr q)
+{
+  nodeptr  r;
+  
+  r = q->back;
+  
+  hookupDefault(p->next,       q);
+  hookupDefault(p->next->next, r);
+} 
+
+static nodeptr buildNewTip (pllInstance *tr, nodeptr p)
+{ 
+  nodeptr  q;
+
+  q = tr->nodep[(tr->nextnode)++];
+  hookupDefault(p, q);
+  q->next->back = (nodeptr)NULL;
+  q->next->next->back = (nodeptr)NULL;
+ 
+  return  q;
+} 
+
+static void buildSimpleTreeRandom (pllInstance *tr, int ip, int iq, int ir)
+{    
+  nodeptr  
+    p, 
+    s;
+  
+  int  
+    i;
+  
+  i = PLL_MIN(ip, iq);
+  if (ir < i)  i = ir; 
+  tr->start = tr->nodep[i];
+  tr->ntips = 3;
+  p = tr->nodep[ip];
+  
+  hookupDefault(p, tr->nodep[iq]);
+  
+  s = buildNewTip(tr, tr->nodep[ir]);
+  
+  insertTaxon(s, p);
+}
+
+static int randomInt(int n, pllInstance *tr)
+{
+  int 
+    res = (int)((double)(n) * randum(&tr->randomNumberSeed));
+
+  assert(res >= 0 && res < n);
+  
+  return res;
+}
+
+void makePermutation(int *perm, int n, pllInstance *tr)
+{    
+  int  
+    i, 
+    j, 
+    k;    
+
+  for (i = 1; i <= n; i++)    
+    perm[i] = i;               
+
+  for (i = 1; i <= n; i++) 
+    {    
+      k =  randomInt(n + 1 - i, tr); /*(int)((double)(n + 1 - i) * randum(&tr->randomNumberSeed));*/
+
+      assert(i + k <= n);
+      
+      j        = perm[i];
+      perm[i]     = perm[i + k];
+      perm[i + k] = j; 
+    }
+}
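+
+/* Illustrative note (hypothetical draws): the loop above is a Fisher-Yates
+   style shuffle of perm[1..n] (index 0 is unused). For n = 4 and draws
+   k = 2, 1, 0, 0 it yields
+   { _, 1, 2, 3, 4 } -> { _, 3, 2, 1, 4 } -> { _, 3, 1, 2, 4 } -> unchanged. */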
+
+static int markBranches(nodeptr *branches, nodeptr p, int *counter, int numsp)
+{
+  if(isTip(p->number, numsp))
+    return 0;
+  else
+    {
+      branches[*counter] = p->next;
+      branches[*counter + 1] = p->next->next;
+      
+      *counter = *counter + 2;
+      
+      return ((2 + markBranches(branches, p->next->back, counter, numsp) + 
+	       markBranches(branches, p->next->next->back, counter, numsp)));
+    }
+}
+
+
+
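+/* pllMakeRandomTree builds a random topology over tr->mxtips taxa: it draws a
+   random permutation of the taxa, builds a three-taxon tree from the first
+   three, and then attaches each remaining taxon to a uniformly chosen branch
+   of the growing tree. */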
+void pllMakeRandomTree(pllInstance *tr)
+{  
+  nodeptr 
+    p, 
+    f, 
+    randomBranch,
+    *branches = (nodeptr *)rax_malloc(sizeof(nodeptr) * (2 * tr->mxtips));    
+  
+  int 
+    nextsp, 
+    *perm = (int *)rax_malloc((tr->mxtips + 1) * sizeof(int)), 
+    branchCounter;                      
+  
+  makePermutation(perm, tr->mxtips, tr);              
+  
+  tr->ntips = 0;       	       
+  tr->nextnode = tr->mxtips + 1;    
+  
+  buildSimpleTreeRandom(tr, perm[1], perm[2], perm[3]);
+  
+  while(tr->ntips < tr->mxtips) 
+    {	             
+      nextsp = ++(tr->ntips);             
+      p = tr->nodep[perm[nextsp]];            
+      
+      buildNewTip(tr, p);  	
+      
+      f = findAnyTip(tr->start, tr->mxtips);
+      f = f->back;
+      
+      branchCounter = 1;
+      branches[0] = f;
+      markBranches(branches, f, &branchCounter, tr->mxtips);
+
+      assert(branchCounter == ((2 * (tr->ntips - 1)) - 3));
+      
+      randomBranch = branches[randomInt(branchCounter, tr)];
+      
+      insertTaxon(p->back, randomBranch);
+    }
+  
+  rax_free(perm);            
+  rax_free(branches);
+}
+
diff --git a/pll/recom.c b/pll/recom.c
new file mode 100644
index 0000000..5ab20c7
--- /dev/null
+++ b/pll/recom.c
@@ -0,0 +1,689 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file recom.c
+ * @brief Functions used for recomputation of vectors (only a fraction of LH vectors stored in RAM)   
+ */
+#include "mem_alloc.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <time.h>
+#include <math.h>
+#ifndef WIN32
+#include <sys/time.h>
+#endif
+#include "pll.h"
+#include "pllInternal.h"
+
+/** @brief Locks node \a nodenum to force it to remain available in memory
+ *
+ * @warning If a node is available we don't need to recompute it, but we need to make sure it is not unpinned while building the rest of the traversal descriptor, i.e. unpinnable must be PLL_FALSE at this point; it will automatically be set to PLL_TRUE after the corresponding post-order instructions have been executed.
+ * If this call is omitted the traversal will likely still work as long as num_allocated_nodes >> log n, but wrong inner vectors will be used at the wrong moment of pllNewviewIterative, careful!
+ *
+ *  @param rvec 
+ *    Recomputation info
+ *
+ *  @param nodenum
+ *    Node id that must remain available in memory 
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ */
+void protectNode(recompVectors *rvec, int nodenum, int mxtips)
+{
+
+  int slot;
+  slot = rvec->iNode[nodenum - mxtips - 1];
+  assert(slot != PLL_NODE_UNPINNED);
+  assert(rvec->iVector[slot] == nodenum);
+
+  if(rvec->unpinnable[slot])
+    rvec->unpinnable[slot] = PLL_FALSE;
+}
+
+/** @brief Checks if \a nodenum  is currently pinned (available in RAM)
+ *
+ *  @note shall we document static functions? 
+ * 
+ *  @param rvec 
+ *    Recomputation info
+ *
+ *  @param nodenum
+ *    Node id to be checked
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ */
+static pllBoolean isNodePinned(recompVectors *rvec, int nodenum, int mxtips)
+{
+  assert(nodenum > mxtips);
+
+  if(rvec->iNode[nodenum - mxtips - 1] == PLL_NODE_UNPINNED)
+    return PLL_FALSE;
+  else
+    return PLL_TRUE;
+}
+
+/** @brief Checks if the likelihood entries at node \a p should be updated
+ *
+ * A node needs update if one of the following holds:
+ *    1. It is not oriented (p->x == 0) 
+ *    2. We are applying recomputations and node \a p is not currently available in RAM
+ *  
+ *  @param recompute 
+ *    PLL_TRUE if recomputation is currently applied 
+ *
+ *  @param rvec 
+ *    Recomputation info
+ *
+ *  @param p
+ *    Node to check whether it is associated with the likelihood vector
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ */
+pllBoolean needsRecomp(pllBoolean recompute, recompVectors *rvec, nodeptr p, int mxtips)
+{ 
+  if((!p->x) || (recompute && !isNodePinned(rvec, p->number, mxtips)))
+    return PLL_TRUE;
+  else
+    return PLL_FALSE;
+}
+
+
+
+/** @brief Allocates memory for the recomputation structure
+ *  
+ *  @todo this should not depend on tr (\a vectorRecomFraction should be a parameter)
+ *
+ */
+void allocRecompVectorsInfo(pllInstance *tr)
+{
+  recompVectors 
+    *v = (recompVectors *) rax_malloc(sizeof(recompVectors));
+
+  int 
+    num_inner_nodes = tr->mxtips - 2,
+                    num_vectors, 
+                    i;
+
+  assert(tr->vectorRecomFraction > PLL_MIN_RECOM_FRACTION);
+  assert(tr->vectorRecomFraction < PLL_MAX_RECOM_FRACTION);
+
+  num_vectors = (int) (1 + tr->vectorRecomFraction * (float)num_inner_nodes); 
+
+  int theoretical_minimum_of_vectors = 3 + ((int)(log((double)tr->mxtips)/log(2.0)));
+  //printBothOpen("Try to use %d ancestral vectors, min required %d\n", num_vectors, theoretical_minimum_of_vectors);
+
+  assert(num_vectors >= theoretical_minimum_of_vectors);
+  assert(num_vectors < tr->mxtips);
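+
+  /* Illustrative (hypothetical numbers): with mxtips = 100 and
+     vectorRecomFraction = 0.5, num_inner_nodes = 98, num_vectors = 1 + 49 = 50
+     and theoretical_minimum_of_vectors = 3 + 6 = 9, so both assertions hold. */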
+
+
+  v->numVectors = num_vectors; /* use minimum bound theoretical */
+
+  /* init vectors tracking */
+
+  v->iVector         = (int *) rax_malloc((size_t)num_vectors * sizeof(int));
+  v->unpinnable      = (pllBoolean *) rax_malloc((size_t)num_vectors * sizeof(pllBoolean));
+
+  for(i = 0; i < num_vectors; i++)
+  {
+    v->iVector[i]         = PLL_SLOT_UNUSED;
+    v->unpinnable[i]      = PLL_FALSE;
+  }
+
+  v->iNode      = (int *) rax_malloc((size_t)num_inner_nodes * sizeof(int));
+  v->stlen      = (int *) rax_malloc((size_t)num_inner_nodes * sizeof(int));
+
+  for(i = 0; i < num_inner_nodes; i++)
+  {
+    v->iNode[i] = PLL_NODE_UNPINNED;
+    v->stlen[i] = PLL_INNER_NODE_INIT_STLEN;
+  }
+
+  v->allSlotsBusy = PLL_FALSE;
+
+  /* init nodes tracking */
+
+  v->maxVectorsUsed = 0;
+  tr->rvec = v;
+}
+
+/** @brief Find the slot id with the minimum cost to be recomputed.
+ *  
+ *  The minimum cost is defined as the minimum subtree size. In general, the closer a vector is to the tips, 
+ *  the fewer recomputations are required to re-establish its likelihood entries
+ *
+ *  @todo remove _DEBUG_RECOMPUTATION code
+ *  
+ *  @param v
+ *    Recomputation info
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ */
+static int findUnpinnableSlotByCost(recompVectors *v, int mxtips)
+{
+  int 
+    i, 
+    slot, 
+    cheapest_slot = -1, 
+    min_cost = mxtips * 2; /* more expensive than the most expensive*/
+#ifdef _DEBUG_RECOMPUTATION 
+  double straTime = gettime();
+#endif 
+
+
+  for(i = 0; i < mxtips - 2; i++)
+  {
+    slot = v->iNode[i];
+    if(slot != PLL_NODE_UNPINNED)
+    {
+      assert(slot >= 0 && slot < v->numVectors);
+
+      if(v->unpinnable[slot])
+      {
+        assert(v->stlen[i] > 0);
+
+        if(v->stlen[i] < min_cost)
+        {
+          min_cost = v->stlen[i];
+          cheapest_slot = slot;
+          /* if the slot costs 2 you can break, because there is nothing cheaper to recompute */
+          if(min_cost == 2)
+            break;
+        }
+      }
+    }
+  }
+  assert(min_cost < mxtips * 2 && min_cost >= 2);
+  assert(cheapest_slot >= 0);
+  return cheapest_slot;
+}
+
+static void unpinAtomicSlot(recompVectors *v, int slot, int mxtips)
+{
+  int 
+    nodenum = v->iVector[slot];
+
+  v->iVector[slot] = PLL_SLOT_UNUSED;
+
+  if(nodenum != PLL_SLOT_UNUSED)  
+    v->iNode[nodenum - mxtips - 1] = PLL_NODE_UNPINNED; 
+}
+
+/** @brief Finds the cheapest slot and unpins it
+ *
+ */
+static int findUnpinnableSlot(recompVectors *v, int mxtips)
+{
+  int     
+    slot_unpinned = findUnpinnableSlotByCost(v, mxtips);
+
+  assert(slot_unpinned >= 0);
+  assert(v->unpinnable[slot_unpinned]);
+
+  unpinAtomicSlot(v, slot_unpinned, mxtips);
+
+  return slot_unpinned;
+}
+
+/** @brief Finds a free slot 
+ * 
+ *  If all slots are occupied, it will find the cheapest slot and unpin it
+ *
+ */
+static int findFreeSlot(recompVectors *v, int mxtips)
+{
+  int 
+    slotno = -1, 
+           i;
+
+  assert(v->allSlotsBusy == PLL_FALSE);
+
+  for(i = 0; i < v->numVectors; i++)
+  {
+    if(v->iVector[i] == PLL_SLOT_UNUSED)
+    {
+      slotno = i;
+      break;
+    } 
+  }
+
+  if(slotno == -1)
+  {
+    v->allSlotsBusy = PLL_TRUE;
+    slotno = findUnpinnableSlot(v, mxtips);
+  }
+
+  return slotno;
+}
+
+
+/** @brief Pins node \a nodenum to slot \a slot
+ *  
+ *  The slot is initialized as non-unpinnable (ensures that the contents of the vector will not be overwritten)
+ *
+ *  @param nodenum
+ *    node id
+ *
+ *  @param slot
+ *    slot id 
+ *    
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ */
+static void pinAtomicNode(recompVectors *v, int nodenum, int slot, int mxtips)
+{
+  v->iVector[slot] = nodenum;
+  v->iNode[nodenum - mxtips - 1] = slot;
+  v->unpinnable[slot] = PLL_FALSE;
+}
+
+static int pinNode(recompVectors *rvec, int nodenum, int mxtips)
+{
+  int 
+    slot;
+
+  assert(!isNodePinned(rvec, nodenum, mxtips));
+
+  if(rvec->allSlotsBusy)
+    slot = findUnpinnableSlot(rvec, mxtips);
+  else
+    slot = findFreeSlot(rvec, mxtips);
+
+  assert(slot >= 0);
+
+  pinAtomicNode(rvec, nodenum, slot, mxtips);
+
+  if(slot > rvec->maxVectorsUsed)
+    rvec->maxVectorsUsed = slot;
+
+  assert(slot == rvec->iNode[nodenum - mxtips - 1]);
+
+  return slot;
+}
+
+/** @brief Marks node \a nodenum as unpinnable
+ *  
+ *  The slot holding the node \a nodenum is added to the pool of slot candidates that can be overwritten.
+ *
+ *  @param v
+ *    Recomputation info
+ *    
+ *  @param nodenum
+ *    node id
+ *    
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ */
+void unpinNode(recompVectors *v, int nodenum, int mxtips)
+{
+  if(nodenum <= mxtips)
+    return;
+  else
+  {
+    int 
+      slot = -1;
+
+    assert(nodenum > mxtips);
+    slot = v->iNode[nodenum-mxtips-1];
+    assert(slot >= 0 && slot < v->numVectors); 
+
+    if(slot >= 0 && slot < v->numVectors)
+      v->unpinnable[slot] = PLL_TRUE;
+  }
+}
+
+
+/** @brief Get a pinned slot \a slot that holds the likelihood vector for inner node \a nodenum
+ *  
+ *  If node \a nodenum is not pinned to any slot yet, the minimum cost replacement strategy is used.
+ *
+ *  @param v
+ *    Recomputation info
+ *    
+ *  @param nodenum
+ *    node id
+ *    
+ *  @param slot
+ *    slot id
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ *  @return
+ *    PLL_TRUE if the slot was just pinned (and its likelihood entries must be recomputed), PLL_FALSE otherwise
+ */
+pllBoolean getxVector(recompVectors *rvec, int nodenum, int *slot, int mxtips)
+{
+  pllBoolean 
+    slotNeedsRecomp = PLL_FALSE;
+
+  *slot = rvec->iNode[nodenum - mxtips - 1];
+
+  if(*slot == PLL_NODE_UNPINNED)
+  {
+    *slot = pinNode(rvec, nodenum, mxtips); /* now we will run the replacement strategy */
+    slotNeedsRecomp = PLL_TRUE;
+  }
+
+  assert(*slot >= 0 && *slot < rvec->numVectors);
+
+  rvec->unpinnable[*slot] = PLL_FALSE;
+
+  return slotNeedsRecomp;
+}
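+
+/* Illustrative sketch (not part of the library; variable names hypothetical)
+ * of the pin/use/unpin cycle around an inner node p:
+ *
+ *     int slot;
+ *     if (getxVector (tr->rvec, p->number, &slot, tr->mxtips))
+ *       {
+ *         // the slot was just (re)pinned, so its likelihood entries are
+ *         // stale and must be recomputed before they are read
+ *       }
+ *     // ... use the likelihood vector held in slot ...
+ *     unpinNode (tr->rvec, p->number, tr->mxtips);  // slot may now be evicted
+ */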
+
+
+#ifdef _DEBUG_RECOMPUTATION
+
+static int subtreeSize(nodeptr p, int maxTips)
+{
+  if(isTip(p->number, maxTips))
+    return 1;
+  else   
+    return (subtreeSize(p->next->back, maxTips) + subtreeSize(p->next->next->back, maxTips));
+}
+
+#endif
+
+/** @brief Annotates unoriented tree nodes with their subtree size 
+ *  
+ *  This function recursively updates the subtree size of each inner node.
+ *  @note The subtree size of node \a p->number is the number of nodes included in the subtree where node record \a p is the virtual root. 
+ *
+ *  @param p
+ *    Pointer to node 
+ *    
+ *  @param maxTips
+ *    Number of tips in the tree
+ *
+ *  @param rvec 
+ *    Recomputation info
+ *    
+ *  @param count
+ *    Number of visited nodes 
+ */
+void computeTraversalInfoStlen(nodeptr p, int maxTips, recompVectors *rvec, int *count) 
+{
+  if(isTip(p->number, maxTips))
+    return;
+  else
+  {          
+    nodeptr 
+      q = p->next->back,
+        r = p->next->next->back;
+
+    *count += 1;
+    /* set xnode info at this point */     
+
+    if(isTip(r->number, maxTips) && isTip(q->number, maxTips))  
+    {
+      rvec->stlen[p->number - maxTips - 1] = 2;	
+
+#ifdef _DEBUG_RECOMPUTATION
+      assert(rvec->stlen[p->number - maxTips - 1] == subtreeSize(p, maxTips));
+#endif
+    }
+    else
+    {
+      if(isTip(r->number, maxTips) || isTip(q->number, maxTips))
+      {	     
+        nodeptr 
+          tmp;
+
+        if(isTip(r->number, maxTips))
+        {
+          tmp = r;
+          r = q;
+          q = tmp;
+        }
+
+        if(!r->x)
+          computeTraversalInfoStlen(r, maxTips, rvec, count);
+
+        rvec->stlen[p->number - maxTips - 1] = rvec->stlen[r->number - maxTips - 1] + 1;
+
+#ifdef _DEBUG_RECOMPUTATION	      
+        assert(rvec->stlen[p->number - maxTips - 1] == subtreeSize(p, maxTips));
+#endif
+      }
+      else
+      {		 
+        if(!r->x)
+          computeTraversalInfoStlen(r, maxTips, rvec, count);
+        if(!q->x)
+          computeTraversalInfoStlen(q, maxTips, rvec, count); 
+
+        rvec->stlen[p->number - maxTips - 1] = rvec->stlen[q->number - maxTips - 1] + rvec->stlen[r->number - maxTips - 1];	
+
+#ifdef _DEBUG_RECOMPUTATION
+        assert(rvec->stlen[p->number - maxTips - 1] == subtreeSize(p, maxTips));
+#endif
+      }
+    }
+  }
+}
+
+
+
+
+/* pre-compute the node stlens (this needs to be known prior to running the strategy) */
+/** @brief Annotates all tree nodes with their subtree size 
+ *  
+ *  Similar to \a computeTraversalInfoStlen, but does a full traversal ignoring orientation.
+ *  The minimum cost is defined as the minimum subtree size. In general, the closer a vector is to the tips, 
+ *  the fewer recomputations are required to re-establish its likelihood entries
+ *
+ *  @param p
+ *    Pointer to node 
+ *    
+ *  @param maxTips
+ *    Number of tips in the tree
+ *
+ *  @param rvec 
+ *    Recomputation info
+ */
+void computeFullTraversalInfoStlen(nodeptr p, int maxTips, recompVectors *rvec) 
+{
+  if(isTip(p->number, maxTips))
+    return;
+  else
+  {    
+    nodeptr 
+      q = p->next->back,
+        r = p->next->next->back;     
+
+    if(isTip(r->number, maxTips) && isTip(q->number, maxTips))
+    {	  
+      rvec->stlen[p->number - maxTips - 1] = 2;
+
+#ifdef _DEBUG_RECOMPUTATION
+      assert(rvec->stlen[p->number - maxTips - 1] == subtreeSize(p, maxTips));
+#endif
+    }
+    else
+    {	    
+      if(isTip(r->number, maxTips) || isTip(q->number, maxTips))
+      {	  	      
+        nodeptr 
+          tmp;
+
+        if(isTip(r->number, maxTips))
+        {
+          tmp = r;
+          r = q;
+          q = tmp;
+        }
+
+        computeFullTraversalInfoStlen(r, maxTips, rvec);
+
+        rvec->stlen[p->number - maxTips - 1] = rvec->stlen[r->number - maxTips - 1] + 1;	   
+
+#ifdef _DEBUG_RECOMPUTATION
+        assert(rvec->stlen[p->number - maxTips - 1] == subtreeSize(p, maxTips));
+#endif
+      }
+      else
+      {	    	     	      
+        computeFullTraversalInfoStlen(r, maxTips, rvec);
+        computeFullTraversalInfoStlen(q, maxTips, rvec); 
+
+        rvec->stlen[p->number - maxTips - 1] = rvec->stlen[q->number - maxTips - 1] + rvec->stlen[r->number - maxTips - 1];
+#ifdef _DEBUG_RECOMPUTATION
+        assert(rvec->stlen[p->number - maxTips - 1] == subtreeSize(p, maxTips));
+#endif
+      }
+    }
+  }
+}
+
+
+#ifdef _DEBUG_RECOMPUTATION
+
+void allocTraversalCounter(pllInstance *tr)
+{
+  traversalCounter 
+    *tc;
+
+  int 
+    k;
+
+  tc = (traversalCounter *)rax_malloc(sizeof(traversalCounter));
+
+  tc->travlenFreq = (unsigned int *)rax_malloc(tr->mxtips * sizeof(int));
+
+  for(k = 0; k < tr->mxtips; k++)
+    tc->travlenFreq[k] = 0;
+
+  tc->tt = 0;
+  tc->ti = 0;
+  tc->ii = 0;
+  tc->numTraversals = 0;
+  tr->travCounter = tc;
+}
+
+/* recomp */
+/* code to track traversal descriptor stats */
+
+void countTraversal(pllInstance *tr)
+{
+  traversalInfo 
+    *ti   = tr->td[0].ti;
+  int i;
+  traversalCounter *tc = tr->travCounter; 
+  tc->numTraversals += 1;
+
+  /*
+  printBothOpen("trav #%d(%d):",tc->numTraversals, tr->td[0].count);
+  */
+
+  for(i = 1; i < tr->td[0].count; i++)
+  {
+    traversalInfo *tInfo = &ti[i];
+
+    /* 
+       printBothOpen(" %d q%d r%d |",  tInfo->pNumber, tInfo->qNumber, tInfo->rNumber);
+       printBothOpen("%d",  tInfo->pNumber);
+       */
+    switch(tInfo->tipCase)
+    {
+      case PLL_TIP_TIP: 
+        tc->tt++; 
+        /* printBothOpen("T"); */
+        break;		  
+      case PLL_TIP_INNER: 
+        tc->ti++; 
+        /* printBothOpen("M"); */
+        break;		  
+
+      case PLL_INNER_INNER: 
+        tc->ii++; 
+        /* printBothOpen("I"); */
+        break;		  
+      default: 
+        assert(0);
+    }
+    /* printBothOpen(" "); */
+  }
+  /* printBothOpen(" so far T %d, M %d, I %d \n", tc->tt, tc->ti,tc->ii); */
+  tc->travlenFreq[tr->td[0].count] += 1;
+}
+
+
+/*
+void printTraversalInfo(pllInstance *tr)
+{
+  int 
+    k, 
+    total_steps = 0;
+
+  printBothOpen("Traversals : %d \n", tr->travCounter->numTraversals);
+  printBothOpen("Traversals tt: %d \n", tr->travCounter->tt);
+  printBothOpen("Traversals ti: %d \n", tr->travCounter->ti);
+  printBothOpen("Traversals ii: %d \n", tr->travCounter->ii);
+  printBothOpen("all: %d \n", tr->travCounter->tt + tr->travCounter->ii + tr->travCounter->ti);
+  printBothOpen("Traversals len freq  : \n");
+  
+  for(k = 0; k < tr->mxtips; k++)
+  {
+    total_steps += tr->travCounter->travlenFreq[k] * (k - 1);
+    if(tr->travCounter->travlenFreq[k] > 0)
+      printBothOpen("len %d : %d\n", k, tr->travCounter->travlenFreq[k]);
+  }
+  printBothOpen("all steps: %d \n", total_steps);
+}
+*/
+/*end code to track traversal descriptor stats */
+/* E recomp */
+
+/*
+void printVector(double *vector, int len, char *name)
+{ 
+  int i;
+  printBothOpen("LHVECTOR %s :", name);
+  for(i=0; i < len; i++)
+  {
+    printBothOpen("%.2f ", vector[i]);
+    if(i>10)
+    {
+      printBothOpen("...");
+      break; 
+    }
+  } 
+  printBothOpen("\n");
+} 
+*/
+
+#endif
+
diff --git a/pll/restartHashTable.c b/pll/restartHashTable.c
new file mode 100644
index 0000000..007e247
--- /dev/null
+++ b/pll/restartHashTable.c
@@ -0,0 +1,357 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file restartHashTable.c
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h> 
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+/*
+static pllBoolean treeNeedString(const char *fp, char c1, int *position)
+{
+  char 
+    c2 = fp[(*position)++];
+  
+  if(c2 == c1)  
+    return PLL_TRUE;
+  else  
+    {   
+      int 
+	lower = PLL_MAX(0, *position - 20),
+	upper = *position + 20;
+      
+      printf("Tree Parsing ERROR: Expecting '%c', found: '%c'\n", c1, c2); 
+      printf("Context: \n");
+      
+      while(lower < upper && fp[lower])
+	printf("%c", fp[lower++]);
+      
+      printf("\n");
+
+      return PLL_FALSE;
+  }
+} 
+
+
+static pllBoolean treeLabelEndString (char ch)
+{
+  switch(ch) 
+    {   
+    case '\0':  
+    case '\t':  
+    case '\n':  
+    case '\r': 
+    case ' ':
+    case ':':  
+    case ',':   
+    case '(':   
+    case ')':  
+    case ';':
+      return PLL_TRUE;
+    default:
+      break;
+    }
+  
+  return PLL_FALSE;
+} 
+
+static pllBoolean  treeGetLabelString (const char *fp, char *lblPtr, int maxlen, int *position)
+{
+  char 
+    ch;
+  
+  pllBoolean  
+    done, 
+    lblfound;
+
+  if (--maxlen < 0) 
+    lblPtr = (char *)NULL; 
+  else 
+    if(lblPtr == NULL) 
+      maxlen = 0;
+
+  ch = fp[(*position)++];
+  
+  done = treeLabelEndString(ch);
+
+  lblfound = !done;  
+
+  while(!done) 
+    {      
+      if(treeLabelEndString(ch)) 
+	break;     
+
+      if(--maxlen >= 0) 
+	*lblPtr++ = ch;
+      
+      ch = fp[(*position)++];      
+    }
+  
+  (*position)--; 
+
+  if (lblPtr != NULL) 
+    *lblPtr = '\0';
+
+  return lblfound;
+}
+
+static pllBoolean  treeFlushLabelString(const char *fp, int *position)
+{ 
+  return  treeGetLabelString(fp, (char *) NULL, (int) 0, position);
+} 
+
+static pllBoolean treeProcessLengthString (const char *fp, double *dptr, int *position)
+{ 
+  (*position)++;
+  
+  if(sscanf(&fp[*position], "%lf", dptr) != 1) 
+    {
+      printf("ERROR: treeProcessLength: Problem reading branch length\n");     
+      assert(0);
+    }
+
+  while(fp[*position] != ',' && fp[*position] != ')' && fp[*position] != ';')
+    *position = *position + 1;
+  
+  return  PLL_TRUE;
+}
+
+static int treeFlushLenString (const char *fp, int *position)
+{
+  double  
+    dummy;  
+  
+  char     
+    ch;
+
+  ch = fp[(*position)++];
+ 
+  if(ch == ':') 
+    {     
+      if(!treeProcessLengthString(fp, &dummy, position)) 
+	return 0;
+      return 1;	  
+    }
+    
+  (*position)--;
+
+  return 1;
+} 
+
+static int treeFindTipByLabelString(char  *str, pllInstance *tr)                    
+{
+  int lookup = lookupWord(str, tr->nameHash);
+
+  if(lookup > 0)
+    {
+      assert(! tr->nodep[lookup]->back);
+      return lookup;
+    }
+  else
+    { 
+      printf("ERROR: Cannot find tree species: %s\n", str);
+      return  0;
+    }
+}
+
+static int treeFindTipNameString (const char *fp, pllInstance *tr, int *position)
+{
+  char    str[PLL_NMLNGTH + 2];
+  int      n;
+
+  if (treeGetLabelString (fp, str, PLL_NMLNGTH + 2, position))
+    n = treeFindTipByLabelString(str, tr);
+  else
+    n = 0;
+   
+  return  n;
+} 
+
+static pllBoolean addElementLenString(const char *fp, pllInstance *tr, nodeptr p, int *position)
+{
+  nodeptr  
+    q;
+  
+  int      
+    n, 
+    fres;
+
+  char 
+    ch;
+  
+  if ((ch = fp[(*position)++]) == '(') 
+    { 
+      n = (tr->nextnode)++;
+      if (n > 2*(tr->mxtips) - 2) 
+	{
+	  if (tr->rooted || n > 2*(tr->mxtips) - 1) 
+	    {
+	      printf("ERROR: Too many internal nodes.  Is tree rooted?\n");
+	      printf("       Deepest splitting should be a trifurcation.\n");
+	      return PLL_FALSE;
+	    }
+	  else 
+	    {	   
+	      tr->rooted = PLL_TRUE;
+	    }
+	}
+      
+      q = tr->nodep[n];
+
+      if (!addElementLenString(fp, tr, q->next, position))        
+	return PLL_FALSE;
+      if (!treeNeedString(fp, ',', position))             
+	return PLL_FALSE;
+      if (!addElementLenString(fp, tr, q->next->next, position))  
+	return PLL_FALSE;
+      if (!treeNeedString(fp, ')', position))             
+	return PLL_FALSE;
+      
+     
+      treeFlushLabelString(fp, position);
+    }
+  else 
+    {   
+      (*position)--;
+     
+      if ((n = treeFindTipNameString(fp, tr, position)) <= 0)          
+	return PLL_FALSE;
+      q = tr->nodep[n];
+      
+      if (tr->start->number > n)  
+	tr->start = q;
+      (tr->ntips)++;
+    }
+  
+     
+  fres = treeFlushLenString(fp, position);
+  if(!fres) 
+    return PLL_FALSE;
+  
+  hookupDefault(p, q);
+
+  return PLL_TRUE;          
+}
+
+
+
+void treeReadTopologyString(char *treeString, pllInstance *tr)
+{ 
+  char 
+    *fp = treeString;
+
+  nodeptr  
+    p;
+  
+  int
+    position = 0, 
+    i;
+  
+  char 
+    ch;   
+    
+
+  for(i = 1; i <= tr->mxtips; i++)    
+    tr->nodep[i]->back = (node *)NULL;      
+  
+  for(i = tr->mxtips + 1; i < 2 * tr->mxtips; i++)
+    {
+      tr->nodep[i]->back = (nodeptr)NULL;
+      tr->nodep[i]->next->back = (nodeptr)NULL;
+      tr->nodep[i]->next->next->back = (nodeptr)NULL;
+      tr->nodep[i]->number = i;
+      tr->nodep[i]->next->number = i;
+      tr->nodep[i]->next->next->number = i;           
+    }
+      
+  tr->start       = tr->nodep[1];
+  tr->ntips       = 0;
+  tr->nextnode    = tr->mxtips + 1;    
+  tr->rooted      = PLL_FALSE;      
+  
+  p = tr->nodep[(tr->nextnode)++]; 
+   
+  assert(fp[position++] == '(');  
+    
+  if (! addElementLenString(fp, tr, p, &position))                 
+    assert(0);
+  
+  if (! treeNeedString(fp, ',', &position))                
+    assert(0);
+   
+  if (! addElementLenString(fp, tr, p->next, &position))           
+    assert(0);
+
+  if(!tr->rooted) 
+    {
+      if ((ch = fp[position++]) == ',') 
+	{ 
+	  if (! addElementLenString(fp, tr, p->next->next, &position)) 
+	    assert(0);	 
+	}
+      else 
+	assert(0);     
+    }
+  else
+    assert(0);
+        
+  if (! treeNeedString(fp, ')', &position))                
+    assert(0);
+
+  treeFlushLabelString(fp, &position);
+  
+  if (!treeFlushLenString(fp, &position))                         
+    assert(0);
+  
+  if (!treeNeedString(fp, ';', &position))       
+    assert(0);
+    
+  if(tr->rooted)     
+    assert(0);           
+  else           
+    tr->start = tr->nodep[1];   
+
+  printf("Tree parsed\n");
+
+} 
+*/
diff --git a/pll/sched.h b/pll/sched.h
new file mode 100644
index 0000000..f36a97a
--- /dev/null
+++ b/pll/sched.h
@@ -0,0 +1,183 @@
+/*
+ * Module: sched.h
+ *
+ * Purpose:
+ *      Provides an implementation of POSIX realtime extensions
+ *      as defined in 
+ *
+ *              POSIX 1003.1b-1993      (POSIX.1b)
+ *
+ * --------------------------------------------------------------------------
+ *
+ *      Pthreads-win32 - POSIX Threads Library for Win32
+ *      Copyright(C) 1998 John E. Bossom
+ *      Copyright(C) 1999,2005 Pthreads-win32 contributors
+ * 
+ *      Contact Email: rpj at callisto.canberra.edu.au
+ * 
+ *      The current list of contributors is contained
+ *      in the file CONTRIBUTORS included with the source
+ *      code distribution. The list can also be seen at the
+ *      following World Wide Web location:
+ *      http://sources.redhat.com/pthreads-win32/contributors.html
+ * 
+ *      This library is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU Lesser General Public
+ *      License as published by the Free Software Foundation; either
+ *      version 2 of the License, or (at your option) any later version.
+ * 
+ *      This library is distributed in the hope that it will be useful,
+ *      but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *      Lesser General Public License for more details.
+ * 
+ *      You should have received a copy of the GNU Lesser General Public
+ *      License along with this library in the file COPYING.LIB;
+ *      if not, write to the Free Software Foundation, Inc.,
+ *      59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+#if !defined(_SCHED_H)
+#define _SCHED_H
+
+#undef PTW32_SCHED_LEVEL
+
+#if defined(_POSIX_SOURCE)
+#define PTW32_SCHED_LEVEL 0
+/* Early POSIX */
+#endif
+
+#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309
+#undef PTW32_SCHED_LEVEL
+#define PTW32_SCHED_LEVEL 1
+/* Include 1b, 1c and 1d */
+#endif
+
+#if defined(INCLUDE_NP)
+#undef PTW32_SCHED_LEVEL
+#define PTW32_SCHED_LEVEL 2
+/* Include Non-Portable extensions */
+#endif
+
+#define PTW32_SCHED_LEVEL_MAX 3
+
+#if ( defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112 )  || !defined(PTW32_SCHED_LEVEL)
+#define PTW32_SCHED_LEVEL PTW32_SCHED_LEVEL_MAX
+/* Include everything */
+#endif
+
+
+#if defined(__GNUC__) && !defined(__declspec)
+# error Please upgrade your GNU compiler to one that supports __declspec.
+#endif
+
+/*
+ * When building the library, you should define PTW32_BUILD so that
+ * the variables/functions are exported correctly. When using the library,
+ * do NOT define PTW32_BUILD, and then the variables/functions will
+ * be imported correctly.
+ */
+#if !defined(PTW32_STATIC_LIB)
+#  if defined(PTW32_BUILD)
+#    define PTW32_DLLPORT __declspec (dllexport)
+#  else
+#    define PTW32_DLLPORT __declspec (dllimport)
+#  endif
+#else
+#  define PTW32_DLLPORT
+#endif
+
+/*
+ * This is a duplicate of what is in the autoconf config.h,
+ * which is only used when building the pthread-win32 libraries.
+ */
+
+#if !defined(PTW32_CONFIG_H)
+#  if defined(WINCE)
+#    define NEED_ERRNO
+#    define NEED_SEM
+#  endif
+#  if defined(__MINGW64__)
+#    define HAVE_STRUCT_TIMESPEC
+#    define HAVE_MODE_T
+#  elif defined(_UWIN) || defined(__MINGW32__)
+#    define HAVE_MODE_T
+#  endif
+#endif
+
+/*
+ *
+ */
+
+#if PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX
+#if defined(NEED_ERRNO)
+#include "need_errno.h"
+#else
+#include <errno.h>
+#endif
+#endif /* PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX */
+
+#if (defined(__MINGW64__) || defined(__MINGW32__)) || defined(_UWIN)
+# if PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX
+/* For pid_t */
+#  include <sys/types.h>
+/* Required by Unix 98 */
+#  include <time.h>
+# else
+   typedef int pid_t;
+# endif
+#else
+ typedef int pid_t;
+#endif
+
+/* Thread scheduling policies */
+
+enum {
+  SCHED_OTHER = 0,
+  SCHED_FIFO,
+  SCHED_RR,
+  SCHED_MIN   = SCHED_OTHER,
+  SCHED_MAX   = SCHED_RR
+};
+
+struct sched_param {
+  int sched_priority;
+};
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif                          /* __cplusplus */
+
+PTW32_DLLPORT int __cdecl sched_yield (void);
+
+PTW32_DLLPORT int __cdecl sched_get_priority_min (int policy);
+
+PTW32_DLLPORT int __cdecl sched_get_priority_max (int policy);
+
+PTW32_DLLPORT int __cdecl sched_setscheduler (pid_t pid, int policy);
+
+PTW32_DLLPORT int __cdecl sched_getscheduler (pid_t pid);
+
+/*
+ * Note that this macro returns ENOTSUP rather than
+ * ENOSYS as might be expected. However, returning ENOSYS
+ * should mean that sched_get_priority_{min,max} are
+ * not implemented as well as sched_rr_get_interval.
+ * This is not the case, since we just don't support
+ * round-robin scheduling. Therefore I have chosen to
+ * return the same value as sched_setscheduler when
+ * SCHED_RR is passed to it.
+ */
+#define sched_rr_get_interval(_pid, _interval) \
+  ( errno = ENOTSUP, (int) -1 )
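+
+/*
+ * Illustrative (not part of this header): a caller therefore observes, e.g.
+ *
+ *     struct timespec ts;
+ *     if (sched_rr_get_interval (0, &ts) == -1 && errno == ENOTSUP)
+ *       ;  // round-robin quantum is not available in this implementation
+ */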
+
+
+#if defined(__cplusplus)
+}                               /* End of extern "C" */
+#endif                          /* __cplusplus */
+
+#undef PTW32_SCHED_LEVEL
+#undef PTW32_SCHED_LEVEL_MAX
+
+#endif                          /* !_SCHED_H */
+
diff --git a/pll/searchAlgo.c b/pll/searchAlgo.c
new file mode 100644
index 0000000..c638d48
--- /dev/null
+++ b/pll/searchAlgo.c
@@ -0,0 +1,3310 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file searchAlgo.c
+ * @brief Collection of routines for performing likelihood computation and branch optimization.
+ *
+ * Detailed description to appear soon.
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h> 
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+typedef struct bInf {
+  double likelihood;
+  nodeptr node;
+} bestInfo;
+
+typedef struct iL {
+  bestInfo *list;
+  int n;
+  int valid;
+} infoList;
+
+double treeOptimizeRapid(pllInstance *tr, partitionList *pr, int mintrav, int maxtrav, bestlist *bt, infoList *iList);
+nniMove getBestNNIForBran(pllInstance* tr, partitionList *pr, nodeptr p, double curLH);
+void evalNNIForSubtree(pllInstance* tr, partitionList *pr, nodeptr p, nniMove* nniList, int* cnt, int* cnt_nni, double curLH);
+
+
+static int cmp_nni(const void* nni1, const void* nni2);
+static void pllTraverseUpdate (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q, int mintrav, int maxtrav, pllRearrangeList * bestList);
+static int pllStoreRearrangement (pllRearrangeList * bestList, pllRearrangeInfo * rearr);
+static int pllTestInsertBIG (pllInstance * tr, partitionList * pr, nodeptr p, nodeptr q, pllRearrangeList * bestList);
+static int pllTestSPR (pllInstance * tr, partitionList * pr, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList);
+static void pllCreateSprInfoRollback (pllInstance * tr, pllRearrangeInfo * rearr, int numBranches);
+static void pllCreateNniInfoRollback (pllInstance * tr, pllRearrangeInfo * rearr);
+static void pllCreateRollbackInfo (pllInstance * tr, pllRearrangeInfo * rearr, int numBranches);
+static void pllRollbackNNI (pllInstance * tr, partitionList * pr, pllRollbackInfo * ri);
+static void pllRollbackSPR (partitionList * pr, pllRollbackInfo * ri);
+
+extern partitionLengths pLengths[PLL_MAX_MODEL];
+
+pllBoolean initrav (pllInstance *tr, partitionList *pr, nodeptr p)
+{ 
+  nodeptr  q;
+
+  if (!isTip(p->number, tr->mxtips)) 
+  {      
+    q = p->next;
+
+    do 
+    {	   
+      if (! initrav(tr, pr, q->back))  return PLL_FALSE;
+      q = q->next;	
+    } 
+    while (q != p);  
+
+    pllUpdatePartials(tr, pr, p, PLL_FALSE);
+  }
+
+  return PLL_TRUE;
+} 
+
+
+/** @brief Optimize the length of a specific branch
+
+    Optimize the length of the branch connecting \a p and \a p->back
+    for each partition (\a tr->numBranches) in library instance \a tr.
+ 
+    @param tr
+      The library instance
+
+    @param pr
+      Partition list
+ 
+    @param p
+      Endpoints of branch to be optimized 
+*/
+void update(pllInstance *tr, partitionList *pr, nodeptr p)
+{       
+  nodeptr  q; 
+  int i;
+  double   z[PLL_NUM_BRANCHES], z0[PLL_NUM_BRANCHES];
+  int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  #ifdef _DEBUG_UPDATE
+    double 
+      startLH;
+  
+    pllEvaluateLikelihood (tr, p);
+  
+    startLH = tr->likelihood;
+  #endif
+
+  q = p->back;   
+
+  for(i = 0; i < numBranches; i++)
+    z0[i] = q->z[i];    
+
+  if(numBranches > 1)
+    makenewzGeneric(tr, pr, p, q, z0, PLL_NEWZPERCYCLE, z, PLL_TRUE);
+  else
+    makenewzGeneric(tr, pr, p, q, z0, PLL_NEWZPERCYCLE, z, PLL_FALSE);
+
+  for(i = 0; i < numBranches; i++)
+  {         
+    if(!tr->partitionConverged[i])
+    {	  
+      if(PLL_ABS(z[i] - z0[i]) > PLL_DELTAZ)  
+      {	      
+        tr->partitionSmoothed[i] = PLL_FALSE;
+      }	 
+
+      p->z[i] = q->z[i] = z[i];	 
+    }
+  }
+ 
+  #ifdef _DEBUG_UPDATE
+    pllEvaluateLikelihood (tr, p);
+  
+    if(tr->likelihood <= startLH)
+      {
+        if(fabs(tr->likelihood - startLH) > 0.01)
+  	{
+  	  printf("%f %f\n", startLH, tr->likelihood);
+  	  assert(0);      
+  	}
+      }
+  #endif
+}
+
+/** @brief Branch length optimization of a subtree
+
+    Optimize the length of the branch connecting \a p and \a p->back, and the
+    lengths of all branches in the subtrees rooted at \a p->next and \a p->next->next
+
+    @param tr
+      The library instance
+
+    @param pr
+      Partition list
+
+    @param p
+      Endpoint of branches to be optimized
+*/
+void smooth (pllInstance *tr, partitionList *pr, nodeptr p)
+{
+  nodeptr  q;
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  update(tr, pr, p);    /*  Adjust branch */
+
+  if (! isTip(p->number, tr->mxtips)) 
+  {                                  /*  Adjust descendants */
+    q = p->next;
+    while (q != p) 
+    {
+      smooth(tr, pr, q->back);
+      q = q->next;
+    }	
+
+    if(numBranches > 1 && !tr->useRecom)
+      pllUpdatePartials(tr, pr,p, PLL_TRUE);
+    else
+      pllUpdatePartials(tr, pr,p, PLL_FALSE);
+  }
+} 
+
+/**  @brief Check whether the branches in all partitions have been optimized
+ 
+     Check if all branches in all partitions have reached the threshold for
+     optimization. If at least one branch can be optimized further return \b PLL_FALSE.
+
+     @param tr
+       The library instance 
+
+     @return
+       If at least one branch can be further optimized return \b PLL_FALSE,
+       otherwise \b PLL_TRUE.
+             
+*/
+static pllBoolean allSmoothed(pllInstance *tr, int numBranches)
+{
+  int i;
+  pllBoolean result = PLL_TRUE;
+
+  for(i = 0; i < numBranches; i++)
+  {
+    if(tr->partitionSmoothed[i] == PLL_FALSE)
+      result = PLL_FALSE;
+    else
+      tr->partitionConverged[i] = PLL_TRUE;
+  }
+
+  return result;
+}
+
+
+/** @brief Optimize all branch lengths of a tree
+  
+    Perform \a maxtimes rounds of branch length optimization by running smooth()
+    on all neighbour nodes of node \a tr->start.
+
+    @param tr
+      The library instance
+
+    @param maxtimes
+      Number of optimization rounds to perform
+*/
+/* do maxtimes rounds of branch length optimization */
+void smoothTree (pllInstance *tr, partitionList *pr, int maxtimes)
+{
+	nodeptr  p, q;
+	int i, count = 0;
+    int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+	p = tr->start;
+	for(i = 0; i < numBranches; i++)
+		tr->partitionConverged[i] = PLL_FALSE;
+
+	while (--maxtimes >= 0)
+	{
+		for(i = 0; i < numBranches; i++)
+			tr->partitionSmoothed[i] = PLL_TRUE;
+
+		smooth(tr, pr, p->back);
+		if (!isTip(p->number, tr->mxtips))
+		{
+			q = p->next;
+			while (q != p)
+			{
+				smooth(tr, pr, q->back);
+				q = q->next;
+			}
+		}
+		count++;
+
+		if (allSmoothed(tr, numBranches)) break;
+	}
+
+	for(i = 0; i < numBranches; i++)
+		tr->partitionConverged[i] = PLL_FALSE;
+} 
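+
+/* Illustrative call (hypothetical bound): optimize all branch lengths for at
+ * most 32 rounds, stopping earlier once every partition reports convergence:
+ *
+ *     smoothTree (tr, pr, 32);
+ */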
+
+
+/** @brief Optimize the branch length of edges around a specific node
+    
+    Optimize \a maxtimes the branch length of all (3) edges around a given node 
+    \a p of the tree of library instance \a tr.
+
+    @param tr
+      The library instance
+
+    @param p
+      The node around which to optimize the edges
+
+    @param maxtimes
+      Number of optimization rounds to perform
+*/
+void localSmooth (pllInstance *tr, partitionList *pr, nodeptr p, int maxtimes)
+{ 
+  nodeptr  q;
+  int i;
+  int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+  if (isTip(p->number, tr->mxtips)) return;
+
+  for(i = 0; i < PLL_NUM_BRANCHES; i++)
+    tr->partitionConverged[i] = PLL_FALSE;	
+
+  while (--maxtimes >= 0) 
+  {     
+    for(i = 0; i < PLL_NUM_BRANCHES; i++)
+      tr->partitionSmoothed[i] = PLL_TRUE;
+
+    q = p;
+    do 
+    {
+      update(tr, pr, q);
+      q = q->next;
+    } 
+    while (q != p);
+
+    if (allSmoothed(tr, numBranches))
+      break;
+  }
+
+  for(i = 0; i < PLL_NUM_BRANCHES; i++)
+  {
+    tr->partitionSmoothed[i] = PLL_FALSE; 
+    tr->partitionConverged[i] = PLL_FALSE;
+  }
+}
+
+
+
+
+/** @brief Reset an \a infoList
+
+    Resets an \a infoList by setting elements \a node and \a likelihood
+    of each element of the \a bestInfo list structure to \b NULL and
+    \a PLL_UNLIKELY, respectively.
+
+    @param iList
+      The given \a infoList.
+*/
+static void resetInfoList(infoList *iList)
+{
+  int 
+    i;
+
+  iList->valid = 0;
+
+  for(i = 0; i < iList->n; i++)    
+  {
+    iList->list[i].node = (nodeptr)NULL;
+    iList->list[i].likelihood = PLL_UNLIKELY;
+  }    
+}
+
+/** @brief Initialize an \a infoList
+
+    Initialize an \a infoList by creating a \a bestInfo list structure
+    of \a n elements and setting the attributes \a node and \a likelihood
+    of each element of the \a bestInfo list structure to \b NULL and
+    \a PLL_UNLIKELY, respectively.
+
+    @param iList
+      The given \a infoList.
+
+    @param n
+      Number of elements to be created in the \a bestInfo list.
+*/
+static void initInfoList(infoList *iList, int n)
+{
+  int 
+    i;
+
+  iList->n = n;
+  iList->valid = 0;
+  iList->list = (bestInfo *)rax_malloc(sizeof(bestInfo) * (size_t)n);
+
+  for(i = 0; i < n; i++)
+  {
+    iList->list[i].node = (nodeptr)NULL;
+    iList->list[i].likelihood = PLL_UNLIKELY;
+  }
+}
+
+/** @brief Deallocate the contents of an \a infoList
+    
+    Deallocate the contents of a given \a infoList by freeing
+    the memory used by its \a bestInfo list structure.
+
+    @param iList
+      The \a infoList to be used.
+*/
+static void freeInfoList(infoList *iList)
+{ 
+  rax_free(iList->list);   
+}
+
+
+/** @brief Insert a record in an \a infoList
+
+    Insert the pair \a likelihood and \a node into list \a iList 
+    \b only if there already exists a pair in \a iList 
+    whose \a likelihood attribute is smaller than the given \a 
+    likelihood. The insertion is done by replacing the smallest
+    likelihood pair with the new pair.
+
+    @param node
+      The given node
+
+    @param likelihood
+      The given likelihood
+
+    @param iList
+      The given \a infoList where the record will possibly be appended.
+*/
+static void insertInfoList(nodeptr node, double likelihood, infoList *iList)
+{
+  int 
+    i,
+    min = 0;
+
+  double 
+    min_l =  iList->list[0].likelihood;
+
+  for(i = 1; i < iList->n; i++)
+  {
+    if(iList->list[i].likelihood < min_l)
+    {
+      min = i;
+      min_l = iList->list[i].likelihood;
+    }
+  }
+
+  if(likelihood > min_l)
+  {
+    iList->list[min].likelihood = likelihood;
+    iList->list[min].node = node;
+    if(iList->valid < iList->n)
+      iList->valid += 1;
+  }
+}
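+
+/* Illustrative (hypothetical values): with iList->n = 3 and stored likelihoods
+ * { -1200.0, -1150.0, -1180.0 }, inserting a pair with likelihood -1190.0
+ * replaces the -1200.0 entry, while inserting -1250.0 leaves the list
+ * unchanged. */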
+
+
+/** @brief  Optimize branch lengths of a region
+
+    Optimize the branch lengths of only a specific region. The branch optimization starts
+    at a node \a p and is carried out in all nodes with distance up to \a region edges from 
+    \a p.
+
+    @param tr
+      The library instance.
+    
+    @param p
+      Node to start branch optimization from.
+
+    @param region
+      The allowed node distance from \a p for which to still perform branch optimization.
+*/
+void smoothRegion (pllInstance *tr, partitionList *pr, nodeptr p, int region)
+{ 
+  nodeptr  q;
+
+  update(tr, pr, p);   /* Adjust branch */
+
+  if (region > 0)
+  {
+    if (!isTip(p->number, tr->mxtips)) 
+    {                                 
+      q = p->next;
+      while (q != p) 
+      {
+        smoothRegion(tr, pr, q->back, --region);
+        q = q->next;
+      }	
+
+      pllUpdatePartials(tr, pr,p, PLL_FALSE);
+    }
+  }
+}
+
+/** @brief Wrapper function for optimizing the branch length of a region \a maxtimes times
+
+    Optimize the branch lengths of a specific region \a maxtimes times. The branch optimization
+    starts at a given node \a p and is carried out in all nodes with distance up to \a region
+    from \a p.
+
+    @param tr
+      The library instance.
+
+    @param p
+      Node to start branch optimization from.
+
+    @param maxtimes
+      Number of times to perform branch optimization.
+
+    @param region
+      The allowed node distance from \a p for which to still perform branch optimization.
+
+    @todo
+      In the previous version (before the model-sep merge) the loops were controlled by tr->numBranches,
+      and now they are controlled by a constant PLL_NUM_BRANCHES. What is right?
+*/
+void regionalSmooth (pllInstance *tr, partitionList *pr, nodeptr p, int maxtimes, int region)
+{
+  nodeptr  q;
+  int i;
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  if (isTip(p->number, tr->mxtips)) return;            /* Should be an error */
+
+  for(i = 0; i < PLL_NUM_BRANCHES; i++)
+    tr->partitionConverged[i] = PLL_FALSE;
+
+  while (--maxtimes >= 0) 
+  {	
+    for(i = 0; i < PLL_NUM_BRANCHES; i++)
+      tr->partitionSmoothed[i] = PLL_TRUE;
+
+    q = p;
+    do 
+    {
+      smoothRegion(tr, pr, q, region);
+      q = q->next;
+    } 
+    while (q != p);
+
+    if (allSmoothed(tr, numBranches))
+      break;
+  }
+
+  for(i = 0; i < PLL_NUM_BRANCHES; i++) {
+    tr->partitionSmoothed[i] = PLL_FALSE;
+    tr->partitionConverged[i] = PLL_FALSE;
+  }
+} 
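+
+/* Usage sketch (illustrative; the radius and iteration count below are assumed values):
+   after a local topology change around an inner node p, only the branches in its
+   neighbourhood need to be re-optimized instead of smoothing the whole tree, e.g.
+
+       regionalSmooth(tr, pr, p, 32, 3);   // up to 32 passes, radius of 3 nodes around p
+*/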
+
+
+
+
+/** @brief Split the tree into two components and optimize new branch length
+
+   Split the tree into two components. The disconnection point is node \a p.
+   First, a branch length is computed for the newly created branch between nodes
+   \a p->next->back and \a p->next->next->back and then the two nodes are
+   connected (hookup). Disconnection is done by setting \a p->next->next->back
+   and \a p->next->back to \b NULL.
+
+   @param tr
+     The library instance
+
+   @param p
+     The node at which the tree should be decomposed into two components.
+
+   @param numBranches
+     Number of branches per partition
+
+   @return
+     Node from the disconnected component
+
+   @todo
+     Why do we return this node?
+
+   @image html removeBIG.png "The diagram shows in blue color the new edge that is created and in red the edges that are removed"
+*/
+nodeptr  removeNodeBIG (pllInstance *tr, partitionList *pr, nodeptr p, int numBranches)
+{  
+//  double   zqr[numBranches], result[numBranches];
+  double*   zqr = rax_malloc(numBranches*sizeof(double)), *result = rax_malloc(numBranches*sizeof(double));
+  nodeptr  q, r;
+  int i;
+
+  q = p->next->back;
+  r = p->next->next->back;
+
+  for(i = 0; i < numBranches; i++)
+    zqr[i] = q->z[i] * r->z[i];        
+
+  makenewzGeneric(tr, pr, q, r, zqr, PLL_ITERATIONS, result, PLL_FALSE);
+
+  for(i = 0; i < numBranches; i++)        
+    tr->zqr[i] = result[i];
+
+  hookup(q, r, result, numBranches); 
+
+  p->next->next->back = p->next->back = (node *) NULL;
+
+  rax_free(result);
+  rax_free(zqr);
+  return  q; 
+}
+
+/** @brief Split the tree into two components and recompute likelihood
+
+    Split the tree into two components. The disconnection point is node \a p.
+    Set the branch length of the new node between \a p->next->back and
+    \a p->next->next->back to \a tr->currentZQR and then decompose the tree
+    into two components by setting \a p->next->back and \a p->next->next->back
+    to \b NULL.
+
+    @param tr
+      The library instance
+
+    @param p
+      The node at which the tree should be decomposed into two components.
+
+    @return
+      The node \a q, i.e. \a p->next->back
+
+    @todo
+      Why do we return this node? Why do we set to tr->currentZQR and not compute
+      new optimized length? What is tr->currentZQR? 
+*/
+nodeptr  removeNodeRestoreBIG (pllInstance *tr, partitionList *pr, nodeptr p)
+{
+  nodeptr  q, r;
+
+  q = p->next->back;
+  r = p->next->next->back;  
+
+  pllUpdatePartials(tr, pr,q, PLL_FALSE);
+  pllUpdatePartials(tr, pr,r, PLL_FALSE);
+
+  hookup(q, r, tr->currentZQR, pr->perGeneBranchLengths?pr->numberOfPartitions:1);
+
+  p->next->next->back = p->next->back = (node *) NULL;
+
+  return  q;
+}
+
+/** @brief Connect two disconnected tree components
+   
+   Connect two disconnected components by specifying an internal edge from one
+   component and a leaf from the other component. The internal edge \a e is the
+   edge between \a q and \a q->back. The leaf is specified by \a p.
+   Edge \a e is removed and two new edges are created. The first one is an edge
+   between \a p->next and \a q, and the second one is between \a p->next->next
+   and \a q->back. The new likelihood vector for node \a p is computed.
+
+   @note The function makes use of the \a thoroughInsertion flag
+
+   @todo
+     What is tr->lzi ? What is thorough insertion? Why do we optimize branch lengths
+     that will be removed? Add explanation
+
+   @image html pll.png "The diagram shows in blue colors the new edges that are created and in red the edge that is removed" 
+*/
+pllBoolean insertBIG (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{
+  nodeptr  r, s;
+  int i;
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  r = q->back;
+  s = p->back;
+
+  for(i = 0; i < numBranches; i++)
+    tr->lzi[i] = q->z[i];
+
+  if(tr->thoroughInsertion)
+  { 
+	  double * zqr = rax_malloc(numBranches*sizeof(double)), 
+		  *zqs = rax_malloc(numBranches*sizeof(double)), 
+		  *zrs = rax_malloc(numBranches*sizeof(double));
+	  double lzqr, lzqs, lzrs, lzsum, lzq, lzr, lzs, lzmax;
+    double *defaultArray=rax_malloc(numBranches*sizeof(double));
+	double *e1 = rax_malloc(numBranches*sizeof(double)),
+		*e2 = rax_malloc(numBranches*sizeof(double)),
+		*e3 = rax_malloc(numBranches*sizeof(double));
+    double *qz;
+
+    qz = q->z;
+
+    for(i = 0; i < numBranches; i++)
+      defaultArray[i] = PLL_DEFAULTZ;
+
+    makenewzGeneric(tr, pr, q, r, qz, PLL_ITERATIONS, zqr, PLL_FALSE);
+    /* The branch length values will be estimated using q, r and s.
+     * q and s are not connected, but both have a valid likelihood vector, so we can call
+     * makenewzGeneric() to get a value for lzsum, which is then used to generate reasonable
+     * starting values e1, e2, e3 for the new branches we create after the insertion.
+     */
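+    /* Rationale (sketch): if, in log space, the optimized pairwise distances satisfy
+     * lzqr ~ lzq + lzr, lzqs ~ lzq + lzs and lzrs ~ lzr + lzs, then solving these three
+     * equations gives lzq = lzsum - lzrs, lzr = lzsum - lzqs and lzs = lzsum - lzqr,
+     * with lzsum = (lzqr + lzqs + lzrs) / 2. These are the starting values computed
+     * below before they are clamped to the allowed branch length range.
+     */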
+
+    makenewzGeneric(tr, pr, q, s, defaultArray, PLL_ITERATIONS, zqs, PLL_FALSE);
+    makenewzGeneric(tr, pr, r, s, defaultArray, PLL_ITERATIONS, zrs, PLL_FALSE);
+
+
+    for(i = 0; i < numBranches; i++)
+    {
+      lzqr = (zqr[i] > PLL_ZMIN) ? log(zqr[i]) : log(PLL_ZMIN); 
+      lzqs = (zqs[i] > PLL_ZMIN) ? log(zqs[i]) : log(PLL_ZMIN);
+      lzrs = (zrs[i] > PLL_ZMIN) ? log(zrs[i]) : log(PLL_ZMIN);
+      lzsum = 0.5 * (lzqr + lzqs + lzrs);
+
+      lzq = lzsum - lzrs;
+      lzr = lzsum - lzqs;
+      lzs = lzsum - lzqr;
+      lzmax = log(PLL_ZMAX);
+
+      if      (lzq > lzmax) {lzq = lzmax; lzr = lzqr; lzs = lzqs;} 
+      else if (lzr > lzmax) {lzr = lzmax; lzq = lzqr; lzs = lzrs;}
+      else if (lzs > lzmax) {lzs = lzmax; lzq = lzqs; lzr = lzrs;}          
+
+      e1[i] = exp(lzq);
+      e2[i] = exp(lzr);
+      e3[i] = exp(lzs);
+    }
+    hookup(p->next,       q, e1, numBranches);
+    hookup(p->next->next, r, e2, numBranches);
+    hookup(p,             s, e3, numBranches);      		  
+	rax_free(e3);
+	rax_free(e2);
+	rax_free(e1);
+	rax_free(defaultArray);
+	rax_free(zrs);
+	rax_free(zqs);
+	rax_free(zqr);
+
+  }
+  else
+  {       
+	  double  *z = rax_malloc(numBranches*sizeof(double));
+
+    for(i = 0; i < numBranches; i++)
+    {
+      z[i] = sqrt(q->z[i]);      
+
+      if(z[i] < PLL_ZMIN) 
+        z[i] = PLL_ZMIN;
+      if(z[i] > PLL_ZMAX)
+        z[i] = PLL_ZMAX;
+    }
+
+    hookup(p->next,       q, z, numBranches);
+    hookup(p->next->next, r, z, numBranches);
+	rax_free(z);
+  }
+
+  pllUpdatePartials(tr, pr,p, PLL_FALSE);
+
+  if(tr->thoroughInsertion)
+  {     
+    localSmooth(tr, pr, p, PLL_MAX_LOCAL_SMOOTHING_ITERATIONS);
+    for(i = 0; i < numBranches; i++)
+    {
+      tr->lzq[i] = p->next->z[i];
+      tr->lzr[i] = p->next->next->z[i];
+      tr->lzs[i] = p->z[i];            
+    }
+  }           
+
+  return  PLL_TRUE;
+}
+
+/** @brief Connect two disconnected tree components without optimizing branch lengths
+   
+   Connect two disconnected components by specifying an internal edge from one
+   component and a leaf from the other component. The internal edge \a e is the
+   edge between \a q and \a q->back. The leaf is specified by \a p.
+   Edge \a e is removed and two new edges are created. The first one is an edge
+   between \a p->next and \a q, and the second one is between \a p->next->next
+   and \a q->back. The new likelihood vector for node \a p is computed.
+
+   @note The function makes use of the \a thoroughInsertion flag
+
+   @todo
+     What is the difference between this and insertBIG? 
+*/
+pllBoolean insertRestoreBIG (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{
+  nodeptr  r, s;
+
+  r = q->back;
+  s = p->back;
+
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  if(tr->thoroughInsertion)
+  {                        
+    hookup(p->next,       q, tr->currentLZQ, numBranches);
+    hookup(p->next->next, r, tr->currentLZR, numBranches);
+    hookup(p,             s, tr->currentLZS, numBranches);
+  }
+  else
+  {       
+    double  z[PLL_NUM_BRANCHES];
+    int i;
+
+    for(i = 0; i < numBranches; i++)
+    {
+      double zz;
+      zz = sqrt(q->z[i]);     
+      if(zz < PLL_ZMIN) 
+        zz = PLL_ZMIN;
+      if(zz > PLL_ZMAX)
+        zz = PLL_ZMAX;
+      z[i] = zz;
+    }
+
+    hookup(p->next,       q, z, numBranches);
+    hookup(p->next->next, r, z, numBranches);
+  }   
+
+  pllUpdatePartials(tr, pr,p, PLL_FALSE);
+
+  return  PLL_TRUE;
+}
+
+
+static void restoreTopologyOnly(pllInstance *tr, bestlist *bt, int numBranches)
+{ 
+  nodeptr p = tr->removeNode;
+  nodeptr q = tr->insertNode;
+  double qz[PLL_NUM_BRANCHES], pz[PLL_NUM_BRANCHES], p1z[PLL_NUM_BRANCHES], p2z[PLL_NUM_BRANCHES];
+  nodeptr p1, p2, r, s;
+  double currentLH = tr->likelihood;
+  int i;
+
+  p1 = p->next->back;
+  p2 = p->next->next->back;
+
+  //memcpy(p1z, p1->z, numBranches*sizeof(double));
+  //memcpy(p2z, p2->z, numBranches*sizeof(double));
+  //memcpy(qz, q->z, numBranches*sizeof(double));
+  //memcpy(pz, p->z, numBranches*sizeof(double));
+  for(i = 0; i < numBranches; i++)
+  {
+    p1z[i] = p1->z[i];
+    p2z[i] = p2->z[i];
+  }
+
+  hookup(p1, p2, tr->currentZQR, numBranches);
+
+  p->next->next->back = p->next->back = (node *) NULL;             
+  for(i = 0; i < numBranches; i++)
+  {
+    qz[i] = q->z[i];
+    pz[i] = p->z[i];
+  }
+
+  r = q->back;
+  s = p->back;
+
+  if(tr->thoroughInsertion)
+  {                        
+    hookup(p->next,       q, tr->currentLZQ, numBranches);
+    hookup(p->next->next, r, tr->currentLZR, numBranches);
+    hookup(p,             s, tr->currentLZS, numBranches);
+  }
+  else
+  { 	
+    double  z[PLL_NUM_BRANCHES];	
+    for(i = 0; i < numBranches; i++)
+    {
+      z[i] = sqrt(q->z[i]);      
+      if(z[i] < PLL_ZMIN)
+        z[i] = PLL_ZMIN;
+      if(z[i] > PLL_ZMAX)
+        z[i] = PLL_ZMAX;
+    }
+    hookup(p->next,       q, z, numBranches);
+    hookup(p->next->next, r, z, numBranches);
+  }     
+
+  tr->likelihood = tr->bestOfNode;
+
+  saveBestTree(bt, tr, numBranches);
+
+  tr->likelihood = currentLH;
+
+  hookup(q, r, qz, numBranches);
+
+  p->next->next->back = p->next->back = (nodeptr) NULL;
+
+  if(tr->thoroughInsertion)    
+    hookup(p, s, pz, numBranches);
+
+  hookup(p->next,       p1, p1z, numBranches);
+  hookup(p->next->next, p2, p2z, numBranches);
+}
+
+/** @brief Test the insertion of a subtree at a given branch
+
+    Inserts the subtree rooted at \a p at the branch between \a q and \a q->back,
+    evaluates the resulting likelihood and records the move in the \a tr structure
+    if it improves on the best found so far; the original topology is then restored.
+*/
+pllBoolean testInsertBIG (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{
+
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  double  qz[PLL_NUM_BRANCHES], pz[PLL_NUM_BRANCHES];
+  nodeptr  r;
+  double startLH = tr->endLH;
+  int i;
+
+  r = q->back; 
+  for(i = 0; i < numBranches; i++)
+  {
+    qz[i] = q->z[i];
+    pz[i] = p->z[i];
+  }
+
+  if (! insertBIG(tr, pr, p, q))       return PLL_FALSE;
+
+  pllEvaluateLikelihood (tr, pr, p->next->next, PLL_FALSE, PLL_FALSE);
+
+  if(tr->likelihood > tr->bestOfNode)
+  {
+    tr->bestOfNode = tr->likelihood;
+    tr->insertNode = q;
+    tr->removeNode = p;   
+    for(i = 0; i < numBranches; i++)
+    {
+      tr->currentZQR[i] = tr->zqr[i];           
+      tr->currentLZR[i] = tr->lzr[i];
+      tr->currentLZQ[i] = tr->lzq[i];
+      tr->currentLZS[i] = tr->lzs[i];      
+    }
+  }
+
+  if(tr->likelihood > tr->endLH)
+  {			  
+    tr->insertNode = q;
+    tr->removeNode = p;   
+    for(i = 0; i < numBranches; i++)
+      tr->currentZQR[i] = tr->zqr[i];      
+    tr->endLH = tr->likelihood;                      
+  }        
+
+  /* reset the topology so that it is the same as it was before calling insertBIG */
+  hookup(q, r, qz, numBranches);
+
+  p->next->next->back = p->next->back = (nodeptr) NULL;
+
+  if(tr->thoroughInsertion)
+  {
+    nodeptr s = p->back;
+    hookup(p, s, pz, numBranches);
+  } 
+
+  if((tr->doCutoff) && (tr->likelihood < startLH))
+  {
+    tr->lhAVG += (startLH - tr->likelihood);
+    tr->lhDEC++;
+    if((startLH - tr->likelihood) >= tr->lhCutoff)
+      return PLL_FALSE;	    
+    else
+      return PLL_TRUE;
+  }
+  else
+    return PLL_TRUE;
+}
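+
+/* Note on the cutoff heuristic used above: when tr->doCutoff is set, every tested
+   insertion that lowers the likelihood adds the drop (startLH - likelihood) to
+   tr->lhAVG and increments tr->lhDEC; treeOptimizeRapid() below derives tr->lhCutoff
+   from the average drop of the previous round, so that clearly unpromising insertions
+   (drop >= lhCutoff) return PLL_FALSE and stop the descent in addTraverseBIG(). */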
+
+
+/** @brief Recursively traverse tree and test insertion
+
+    Recursively traverses the tree structure starting from node \a q and
+    tests the insertion of the component specified by leaf \a p at the edge
+    between \a q and \a q->back.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+    @param p
+      Leaf node of one tree component
+
+    @param q
+      Endpoint node of the edge to test the insertion
+
+    @param mintrav
+      Minimum radius around \a q to test the insertion
+
+    @param maxtrav
+      Maximum radius around \a q to test the insertion
+*/
+void addTraverseBIG(pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q, int mintrav, int maxtrav)
+{  
+  if (--mintrav <= 0) 
+  {              
+    if (! testInsertBIG(tr, pr, p, q))  return;
+
+  }
+
+  if ((!isTip(q->number, tr->mxtips)) && (--maxtrav > 0)) 
+  {    
+    addTraverseBIG(tr, pr, p, q->next->back, mintrav, maxtrav);
+    addTraverseBIG(tr, pr, p, q->next->next->back, mintrav, maxtrav);
+  }
+} 
+
+
+
+
+/** @brief Compute the best SPR movement
+
+    Compute all SPR moves starting from \a p in the space defined by \a mintrav and
+    \a maxtrav and store the best in the \a tr structure.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node from which to start the SPR moves testing
+
+    @param mintrav
+      Minimum distance from \a p where to start testing SPRs
+
+    @param maxtrav
+      Maximum distance from \a p where to test SPRs
+
+    @return
+       0, 1 or \b PLL_BADREAR
+
+    @todo
+      fix the return value
+*/
+int rearrangeBIG(pllInstance *tr, partitionList *pr, nodeptr p, int mintrav, int maxtrav)
+{  
+  double   p1z[PLL_NUM_BRANCHES], p2z[PLL_NUM_BRANCHES], q1z[PLL_NUM_BRANCHES], q2z[PLL_NUM_BRANCHES];
+  nodeptr  p1, p2, q, q1, q2;
+  int      mintrav2, i;  
+  pllBoolean doP = PLL_TRUE, doQ = PLL_TRUE;
+  int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  if (maxtrav < 1 || mintrav > maxtrav)  return (0);
+  q = p->back;
+
+
+
+
+  if (!isTip(p->number, tr->mxtips) && doP) 
+  {     
+    p1 = p->next->back;
+    p2 = p->next->next->back;
+
+
+    if(!isTip(p1->number, tr->mxtips) || !isTip(p2->number, tr->mxtips))
+    {
+      for(i = 0; i < numBranches; i++)
+      {
+        p1z[i] = p1->z[i];
+        p2z[i] = p2->z[i];	   	   
+      }
+
+      if (! removeNodeBIG(tr, pr, p,  numBranches)) return PLL_BADREAR;
+
+      if (!isTip(p1->number, tr->mxtips)) 
+      {
+        addTraverseBIG(tr, pr, p, p1->next->back,
+            mintrav, maxtrav);         
+
+        addTraverseBIG(tr, pr, p, p1->next->next->back,
+            mintrav, maxtrav);          
+      }
+
+      if (!isTip(p2->number, tr->mxtips)) 
+      {
+        addTraverseBIG(tr, pr, p, p2->next->back,
+            mintrav, maxtrav);
+        addTraverseBIG(tr, pr, p, p2->next->next->back,
+            mintrav, maxtrav);          
+      }
+
+      hookup(p->next,       p1, p1z, numBranches);
+      hookup(p->next->next, p2, p2z, numBranches);
+      pllUpdatePartials(tr, pr,p, PLL_FALSE);
+    }
+  }  
+
+  if (!isTip(q->number, tr->mxtips) && maxtrav > 0 && doQ) 
+  {
+    q1 = q->next->back;
+    q2 = q->next->next->back;
+
+    /*if (((!q1->tip) && (!q1->next->back->tip || !q1->next->next->back->tip)) ||
+      ((!q2->tip) && (!q2->next->back->tip || !q2->next->next->back->tip))) */
+    if (
+        (
+         ! isTip(q1->number, tr->mxtips) && 
+         (! isTip(q1->next->back->number, tr->mxtips) || ! isTip(q1->next->next->back->number, tr->mxtips))
+        )
+        ||
+        (
+         ! isTip(q2->number, tr->mxtips) && 
+         (! isTip(q2->next->back->number, tr->mxtips) || ! isTip(q2->next->next->back->number, tr->mxtips))
+        )
+       )
+    {
+
+      for(i = 0; i < numBranches; i++)
+      {
+        q1z[i] = q1->z[i];
+        q2z[i] = q2->z[i];
+      }
+
+      if (! removeNodeBIG(tr, pr, q, numBranches)) return PLL_BADREAR;
+
+      mintrav2 = mintrav > 2 ? mintrav : 2;
+
+      if (/*! q1->tip*/ !isTip(q1->number, tr->mxtips)) 
+      {
+        addTraverseBIG(tr, pr, q, q1->next->back,
+            mintrav2 , maxtrav);
+        addTraverseBIG(tr, pr, q, q1->next->next->back,
+            mintrav2 , maxtrav);         
+      }
+
+      if (/*! q2->tip*/ ! isTip(q2->number, tr->mxtips)) 
+      {
+        addTraverseBIG(tr, pr, q, q2->next->back,
+            mintrav2 , maxtrav);
+        addTraverseBIG(tr, pr, q, q2->next->next->back,
+            mintrav2 , maxtrav);          
+      }	   
+
+      hookup(q->next,       q1, q1z, numBranches);
+      hookup(q->next->next, q2, q2z, numBranches);
+
+      pllUpdatePartials(tr, pr,q, PLL_FALSE);
+    }
+  } 
+
+  return  1;
+} 
+
+
+
+
+/** @brief Perform one round of SPR moves on the tree
+
+    Visits every node of the tree and tests the SPR moves that prune the subtree
+    rooted at that node and re-insert it within the rearrangement radius defined by
+    \a mintrav and \a maxtrav. Improving moves are applied, the best trees are saved
+    in \a bt and the most promising candidate nodes are recorded in \a iList.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param mintrav
+      Minimum rearrangement radius (distance from the pruning point)
+
+    @param maxtrav
+      Maximum rearrangement radius (distance from the pruning point)
+
+    @param bt
+      List in which the best trees found are stored
+
+    @param iList
+      List in which the most promising candidate nodes are stored
+
+    @return
+      The resulting likelihood (\a tr->startLH)
+*/
+double treeOptimizeRapid(pllInstance *tr, partitionList *pr, int mintrav, int maxtrav, bestlist *bt, infoList *iList)
+{
+  int i, index,
+      *perm = (int*)NULL;   
+
+  nodeRectifier(tr);
+
+
+
+  if (maxtrav > tr->mxtips - 3)  
+    maxtrav = tr->mxtips - 3;  
+
+
+
+  resetInfoList(iList);
+
+  resetBestTree(bt);
+
+  tr->startLH = tr->endLH = tr->likelihood;
+
+  if(tr->doCutoff)
+  {
+    if(tr->bigCutoff)
+    {	  
+      if(tr->itCount == 0)    
+        tr->lhCutoff = 0.5 * (tr->likelihood / -1000.0);    
+      else    		 
+        tr->lhCutoff = 0.5 * ((tr->lhAVG) / ((double)(tr->lhDEC))); 	  
+    }
+    else
+    {
+      if(tr->itCount == 0)    
+        tr->lhCutoff = tr->likelihood / -1000.0;    
+      else    		 
+        tr->lhCutoff = (tr->lhAVG) / ((double)(tr->lhDEC));   
+    }    
+
+    tr->itCount = tr->itCount + 1;
+    tr->lhAVG = 0;
+    tr->lhDEC = 0;
+  }
+
+  /*
+     printf("DoCutoff: %d\n", tr->doCutoff);
+     printf("%d %f %f %f\n", tr->itCount, tr->lhAVG, tr->lhDEC, tr->lhCutoff);
+
+     printf("%d %d\n", mintrav, maxtrav);
+     */
+
+  for(i = 1; i <= tr->mxtips + tr->mxtips - 2; i++)
+  {           
+    tr->bestOfNode = PLL_UNLIKELY;          
+
+    if(tr->permuteTreeoptimize)
+      index = perm[i];
+    else
+      index = i;     
+
+    if(rearrangeBIG(tr, pr, tr->nodep[index], mintrav, maxtrav))
+    {    
+      if(tr->thoroughInsertion)
+      {
+        if(tr->endLH > tr->startLH)                 	
+        {			   
+          /* commit the best SPR found by rearrangeBIG */
+          restoreTreeFast(tr, pr);    
+          tr->startLH = tr->endLH = tr->likelihood;	 
+          saveBestTree(bt, tr, pr->perGeneBranchLengths?pr->numberOfPartitions:1);
+        }
+        else
+        { 		  
+          if(tr->bestOfNode != PLL_UNLIKELY)
+            restoreTopologyOnly(tr, bt, pr->perGeneBranchLengths?pr->numberOfPartitions:1);
+        }	   
+      }
+      else
+      {
+        insertInfoList(tr->nodep[index], tr->bestOfNode, iList);	    
+        if(tr->endLH > tr->startLH)                 	
+        {		      
+          restoreTreeFast(tr, pr);
+          tr->startLH = tr->endLH = tr->likelihood;	  	 	  	  	  	  	  	  
+        }	    	  
+      }
+    }     
+  }     
+
+  if(!tr->thoroughInsertion)
+  {           
+    tr->thoroughInsertion = PLL_TRUE;  
+
+    for(i = 0; i < iList->valid; i++)
+    { 	  
+      tr->bestOfNode = PLL_UNLIKELY;
+
+      if(rearrangeBIG(tr, pr, iList->list[i].node, mintrav, maxtrav))
+      {	  
+        if(tr->endLH > tr->startLH)                 	
+        {	 	     
+          restoreTreeFast(tr, pr);
+          tr->startLH = tr->endLH = tr->likelihood;	 
+          saveBestTree(bt, tr, pr->perGeneBranchLengths?pr->numberOfPartitions:1);
+        }
+        else
+        { 
+
+          if(tr->bestOfNode != PLL_UNLIKELY)
+          {	     
+            restoreTopologyOnly(tr, bt, pr->perGeneBranchLengths?pr->numberOfPartitions:1);
+          }	
+        }      
+      }
+    }       
+
+    tr->thoroughInsertion = PLL_FALSE;
+  }
+
+  if(tr->permuteTreeoptimize)
+    rax_free(perm);
+
+  return tr->startLH;     
+}
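+
+/* Driver sketch (hypothetical; bt and iList are assumed to have been allocated and
+   initialized elsewhere): a search loop typically calls treeOptimizeRapid() with an
+   increasing rearrangement radius and keeps track of the best likelihood, e.g.
+
+       int radius;
+       double bestLH = tr->likelihood;
+       for (radius = 5; radius <= 25; radius += 5)
+         {
+           double lh = treeOptimizeRapid(tr, pr, 1, radius, bt, iList);
+           if (lh > bestLH)
+             bestLH = lh;
+         }
+*/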
+
+
+
+
+pllBoolean testInsertRestoreBIG (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{    
+  if(tr->thoroughInsertion)
+  {
+    if (! insertBIG(tr, pr, p, q))       return PLL_FALSE;
+
+    pllEvaluateLikelihood (tr, pr, p->next->next, PLL_FALSE, PLL_FALSE);
+  }
+  else
+  {
+    if (! insertRestoreBIG(tr, pr, p, q))       return PLL_FALSE;
+
+    {
+      nodeptr x, y;
+      x = p->next->next;
+      y = p->back;
+
+      if(! isTip(x->number, tr->mxtips) && isTip(y->number, tr->mxtips))
+      {
+        while ((! x->x)) 
+        {
+          if (! (x->x))
+            pllUpdatePartials(tr, pr,x, PLL_FALSE);
+        }
+      }
+
+      if(isTip(x->number, tr->mxtips) && !isTip(y->number, tr->mxtips))
+      {
+        while ((! y->x)) 
+        {		  
+          if (! (y->x))
+            pllUpdatePartials(tr, pr,y, PLL_FALSE);
+        }
+      }
+
+      if(!isTip(x->number, tr->mxtips) && !isTip(y->number, tr->mxtips))
+      {
+        while ((! x->x) || (! y->x)) 
+        {
+          if (! (x->x))
+            pllUpdatePartials(tr, pr,x, PLL_FALSE);
+          if (! (y->x))
+            pllUpdatePartials(tr, pr,y, PLL_FALSE);
+        }
+      }				      	
+
+    }
+
+    tr->likelihood = tr->endLH;
+  }
+
+  return PLL_TRUE;
+} 
+
+void restoreTreeFast(pllInstance *tr, partitionList *pr)
+{
+  removeNodeRestoreBIG(tr, pr, tr->removeNode);
+  testInsertRestoreBIG(tr, pr, tr->removeNode, tr->insertNode);
+}
+
+/*
+static void myfwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+  size_t  
+    bytes_written = fwrite(ptr, size, nmemb, stream);
+
+  assert(bytes_written == nmemb);
+}
+
+static void myfread(void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+  size_t
+    bytes_read;
+
+  bytes_read = fread(ptr, size, nmemb, stream);
+
+  assert(bytes_read == nmemb);
+}
+
+static void readTree(pllInstance *tr, partitionList *pr, FILE *f)
+{
+  int 
+    nodeNumber,   
+    x = tr->mxtips + 3 * (tr->mxtips - 1);
+
+  nodeptr
+    startAddress;
+
+  myfread(&nodeNumber, sizeof(int), 1, f);
+
+  tr->start = tr->nodep[nodeNumber];
+
+
+  myfread(&startAddress, sizeof(nodeptr), 1, f);
+
+  myfread(tr->nodeBaseAddress, sizeof(node), x, f);
+
+  {
+    int i;    
+
+    size_t         
+      offset;
+
+    pllBoolean 
+      addIt;
+
+    if(startAddress > tr->nodeBaseAddress)
+    {
+      addIt = PLL_FALSE;
+      offset = (size_t)startAddress - (size_t)tr->nodeBaseAddress;
+    }
+    else
+    {
+      addIt = PLL_TRUE;
+      offset = (size_t)tr->nodeBaseAddress - (size_t)startAddress;
+    }       
+
+    for(i = 0; i < x; i++)
+    {      	
+      if(addIt)
+      {	    
+        tr->nodeBaseAddress[i].next = (nodeptr)((size_t)tr->nodeBaseAddress[i].next + offset);	
+        tr->nodeBaseAddress[i].back = (nodeptr)((size_t)tr->nodeBaseAddress[i].back + offset);
+      }
+      else
+      {
+
+        tr->nodeBaseAddress[i].next = (nodeptr)((size_t)tr->nodeBaseAddress[i].next - offset);	
+        tr->nodeBaseAddress[i].back = (nodeptr)((size_t)tr->nodeBaseAddress[i].back - offset);	   
+      } 
+    }
+
+  }
+
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+  printBothOpen("RAxML Restart with likelihood: %1.50f\n", tr->likelihood);
+}
+
+static void readCheckpoint(pllInstance *tr, partitionList *pr)
+{
+  int  
+    restartErrors = 0,
+                  model; 
+
+  FILE 
+    *f = myfopen(binaryCheckpointInputName, "r");
+*/
+  /* cdta */   
+/*
+  myfread(&(tr->ckp), sizeof(checkPointState), 1, f);
+
+
+
+  if(tr->ckp.searchConvergenceCriterion != tr->searchConvergenceCriterion)
+  {
+    printf("restart error, you are trying to re-start a run where the ML search criterion was turned %s\n", (tr->ckp.searchConvergenceCriterion)?"ON":"OFF");
+    restartErrors++;
+  }  
+
+  if(tr->ckp.rateHetModel !=  tr->rateHetModel)
+  {
+    printf("restart error, you are trying to re-start a run with a different model of rate heterogeneity, the checkpoint was obtained under: %s\n", (tr->ckp.rateHetModel == PLL_GAMMA)?"GAMMA":"PSR");
+    restartErrors++;
+  }  
+
+  if(tr->ckp.maxCategories !=  tr->maxCategories)
+  {
+    printf("restart error, you are trying to re-start a run with %d per-site rate categories, the checkpoint was obtained with: %d\n", tr->maxCategories, tr->ckp.maxCategories);
+    restartErrors++;
+  }
+
+  if(tr->ckp.NumberOfModels != pr->numberOfPartitions)
+  {
+    printf("restart error, you are trying to re-start a run with %d partitions, the checkpoint was obtained with: %d partitions\n", (int)pr->numberOfPartitions, tr->ckp.NumberOfModels);
+    restartErrors++;      
+  }
+
+  if(tr->ckp.numBranches != pr->perGeneBranchLengths?pr->numberOfPartitions:1)
+  {
+    printf("restart error, you are trying to re-start a run where independent per-site branch length estimates were turned %s\n", (tr->ckp.numBranches > 1)?"ON":"OFF");
+    restartErrors++;
+  }
+
+  if(tr->ckp.originalCrunchedLength != tr->originalCrunchedLength)
+  {
+    printf("restart error, you are trying to re-start a run with %d site patterns, the checkpoint was obtained with: %d site patterns\n", tr->ckp.originalCrunchedLength, tr->originalCrunchedLength);
+    restartErrors++; 
+  }
+
+  if(tr->ckp.mxtips != tr->mxtips)
+  {
+    printf("restart error, you are trying to re-start a run with %d taxa, the checkpoint was obtained with: %d taxa\n", tr->mxtips, tr->ckp.mxtips);
+    restartErrors++; 
+  }
+
+  if(strcmp(tr->ckp.seq_file, seq_file) != 0)
+  {
+    printf("restart error, you are trying to re-start from alignemnt file %s, the checkpoint was obtained with file: %s\n", tr->ckp.seq_file, seq_file);
+    restartErrors++; 
+  }
+
+  printf("REstart errors: %d\n", restartErrors);
+
+  if(restartErrors > 0)
+  {
+    printf("User induced errors with the restart from checkpoint, exiting ...\n");
+
+    if(restartErrors > 4)
+      printf(" ... maybe you should do field work instead of trying to use a computer ...\n");
+    if(restartErrors > 6)
+      printf(" ... kala eisai telios ilithios;\n");
+
+    exit(-1);
+  }
+
+  tr->ntips = tr->mxtips;
+
+  tr->startLH    = tr->ckp.tr_startLH;
+  tr->endLH      = tr->ckp.tr_endLH;
+  tr->likelihood = tr->ckp.tr_likelihood;
+  tr->bestOfNode = tr->ckp.tr_bestOfNode;
+
+  tr->lhCutoff   = tr->ckp.tr_lhCutoff;
+  tr->lhAVG      = tr->ckp.tr_lhAVG;
+  tr->lhDEC      = tr->ckp.tr_lhDEC;
+  tr->itCount    = tr->ckp.tr_itCount;
+  tr->thoroughInsertion       = tr->ckp.tr_thoroughInsertion;
+
+
+
+  accumulatedTime = tr->ckp.accumulatedTime;
+*/
+  /* printf("Accumulated time so far: %f\n", accumulatedTime); */
+/*
+  tr->optimizeRateCategoryInvocations = tr->ckp.tr_optimizeRateCategoryInvocations;
+
+
+  myfread(tr->tree0, sizeof(char), tr->treeStringLength, f);
+  myfread(tr->tree1, sizeof(char), tr->treeStringLength, f);
+
+  if(tr->searchConvergenceCriterion)
+  {
+    int bCounter = 0;
+
+    if((tr->ckp.state == PLL_FAST_SPRS && tr->ckp.fastIterations > 0) ||
+        (tr->ckp.state == PLL_SLOW_SPRS && tr->ckp.thoroughIterations > 0))
+    { 
+
+#ifdef _DEBUG_CHECKPOINTING    
+      printf("parsing Tree 0\n");
+#endif
+
+      treeReadTopologyString(tr->tree0, tr);   
+
+      bitVectorInitravSpecial(tr->bitVectors, tr->nodep[1]->back, tr->mxtips, tr->vLength, tr->h, 0, PLL_BIPARTITIONS_RF, (branchInfo *)NULL,
+          &bCounter, 1, PLL_FALSE, PLL_FALSE, tr->threadID);
+
+      assert(bCounter == tr->mxtips - 3);
+    }
+
+    bCounter = 0;
+
+    if((tr->ckp.state == PLL_FAST_SPRS && tr->ckp.fastIterations > 1) ||
+        (tr->ckp.state == PLL_SLOW_SPRS && tr->ckp.thoroughIterations > 1))
+    {
+
+#ifdef _DEBUG_CHECKPOINTING
+      printf("parsing Tree 1\n");
+#endif
+
+      treeReadTopologyString(tr->tree1, tr); 
+
+      bitVectorInitravSpecial(tr->bitVectors, tr->nodep[1]->back, tr->mxtips, tr->vLength, tr->h, 1, PLL_BIPARTITIONS_RF, (branchInfo *)NULL,
+          &bCounter, 1, PLL_FALSE, PLL_FALSE, tr->threadID);
+
+      assert(bCounter == tr->mxtips - 3);
+    }
+  }
+
+  myfread(tr->rateCategory, sizeof(int), tr->originalCrunchedLength, f);
+  myfread(tr->patrat, sizeof(double), tr->originalCrunchedLength, f);
+  myfread(tr->patratStored, sizeof(double), tr->originalCrunchedLength, f);
+
+*/
+  /* need to read this as well in checkpoints, otherwise the branch lengths 
+     in the output tree files will be wrong, not the internal branch lengths though */
+/*
+  //TODO: Same problem as writing the checkpoint
+  //myfread(tr->fracchanges,  sizeof(double), pr->numberOfPartitions, f);
+  myfread(&(tr->fracchange),   sizeof(double), 1, f);
+*/
+  /* pInfo */
+/*
+  for(model = 0; model < pr->numberOfPartitions; model++)
+  {
+    int 
+      dataType = pr->partitionData[model]->dataType;
+
+    myfread(&(pr->partitionData[model]->numberOfCategories), sizeof(int), 1, f);
+    myfread(pr->partitionData[model]->perSiteRates, sizeof(double), tr->maxCategories, f);
+    myfread(pr->partitionData[model]->EIGN, sizeof(double), pLengths[dataType].eignLength, f);
+    myfread(pr->partitionData[model]->EV, sizeof(double),  pLengths[dataType].evLength, f);
+    myfread(pr->partitionData[model]->EI, sizeof(double),  pLengths[dataType].eiLength, f);
+
+    myfread(pr->partitionData[model]->frequencies, sizeof(double),  pLengths[dataType].frequenciesLength, f);
+    myfread(pr->partitionData[model]->tipVector, sizeof(double),  pLengths[dataType].tipVectorLength, f);
+    myfread(pr->partitionData[model]->substRates, sizeof(double),  pLengths[dataType].substRatesLength, f);
+    myfread(&(pr->partitionData[model]->alpha), sizeof(double), 1, f);
+    
+    if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+	{
+	  int 
+	    k;
+	  
+	  for(k = 0; k < 4; k++)
+	    {
+	      myfread(pr->partitionData[model]->EIGN_LG4[k], sizeof(double), pLengths[dataType].eignLength, f);
+	      myfread(pr->partitionData[model]->EV_LG4[k], sizeof(double),  pLengths[dataType].evLength, f);
+	      myfread(pr->partitionData[model]->EI_LG4[k], sizeof(double),  pLengths[dataType].eiLength, f);    
+	      myfread(pr->partitionData[model]->frequencies_LG4[k], sizeof(double),  pLengths[dataType].frequenciesLength, f);
+	      myfread(pr->partitionData[model]->tipVector_LG4[k], sizeof(double),  pLengths[dataType].tipVectorLength, f);  
+	      myfread(pr->partitionData[model]->substRates_LG4[k], sizeof(double),  pLengths[dataType].substRatesLength, f);    
+	    }
+	}
+
+    pllMakeGammaCats(pr->partitionData[model]->alpha, pr->partitionData[model]->gammaRates, 4, tr->useMedian);
+  }
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier (tr, pr, PLL_THREAD_COPY_INIT_MODEL);
+#endif
+
+  updatePerSiteRates(tr, pr, PLL_FALSE);
+
+  readTree(tr, pr, f);
+
+  fclose(f); 
+
+}
+
+void restart(pllInstance *tr, partitionList *pr)
+{  
+  readCheckpoint(tr, pr);
+
+  switch(tr->ckp.state)
+  {
+    case PLL_REARR_SETTING:      
+      break;
+    case PLL_FAST_SPRS:
+      break;
+    case PLL_SLOW_SPRS:
+      break;
+    default:
+      assert(0);
+  }
+}
+*/
+
+/* The maximum number of smoothing iterations is given explicitly */
+/** @brief Optimize branch lengths and evaluate likelihood of topology
+    
+    Optimize the branch lengths \a maxSmoothIterations times and evaluate
+    the likelihood of the tree. The resulting likelihood is placed in
+    \a tr->likelihood
+
+    @param tr
+      The PLL instance
+
+    @param pr
+      List of partitions
+
+    @param maxSmoothIterations
+      Number of times to optimize branch lengths
+*/
+void
+pllOptimizeBranchLengths (pllInstance *tr, partitionList *pr, int maxSmoothIterations)       /* Evaluate a user tree */
+{
+  smoothTree(tr, pr, maxSmoothIterations); /* former (32 * smoothFactor) */
+
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+}
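+
+/* Usage sketch (illustrative; the iteration count is an assumed value): to score a
+   user-supplied topology, optimize its branch lengths and read the log-likelihood, e.g.
+
+       pllOptimizeBranchLengths(tr, pr, 32);
+       printf("logLH = %f\n", tr->likelihood);
+*/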
+
+/** @brief Perform an NNI move
+
+    Modify the tree topology of instance \a tr by performing an NNI (Nearest Neighbor
+    Interchange) move at node \a p. Let \a q be \a p->back. If \a swap is set to \b PLL_NNI_P_NEXT 
+    then the subtrees rooted at \a p->next->back and \a q->next->back will be swapped. Otherwise,
+    if \a swap is set to \b PLL_NNI_P_NEXTNEXT then the subtrees rooted at \a p->next->next->back and
+    \a q->next->back are swapped. For clarity, see the illustration.
+
+    @param tr
+      PLL instance
+
+    @param p
+      Node to use as origin for performing NNI
+
+    @param swap
+      Which node to use for the NNI move. \b PLL_NNI_P_NEXT uses node p->next while \b PLL_NNI_P_NEXTNEXT uses p->next->next
+
+    @return
+      In case of success \b PLL_TRUE, otherwise \b PLL_FALSE
+
+    @todo
+      Started error checking here. Instead of checking the errors in the specified way, implement a variadic
+      function where we pass the results of each check and the error code we want to assign if there is at
+      least one negative result
+
+    @image html nni.png "In case \a swap is set to \b PLL_NNI_P_NEXT then the dashed red edge between \a p and \a r is removed and the blue edges are created. If \a swap is set to \b PLL_NNI_P_NEXTNEXT then the dashed red edge between \a p and \a s is removed and the green edges are created. In both cases the black dashed edge is removed"
+*/
+int pllTopologyPerformNNI(pllInstance * tr, nodeptr p, int swap)
+{
+  nodeptr       q, r;
+
+  q = p->back;
+  if (isTip(q->number, tr->mxtips))
+   {
+     errno = PLL_NNI_Q_TIP;
+     return (PLL_FALSE);
+   }
+  if (isTip(p->number, tr->mxtips))
+   {
+     errno = PLL_NNI_P_TIP;
+     return (PLL_FALSE);
+   }
+  assert(!isTip(q->number, tr->mxtips));
+  assert(!isTip(p->number, tr->mxtips));
+
+
+  if(swap == PLL_NNI_P_NEXT)
+   {
+     r = p->next->back;
+     hookupFull(p->next, q->next->back, q->next->z);
+     hookupFull(q->next, r,             p->next->z);
+   }
+  else
+   {
+     r = p->next->next->back;
+     hookupFull(p->next->next, q->next->back, q->next->z);
+     hookupFull(q->next,       r,             p->next->next->z);
+   }
+
+  return PLL_TRUE;
+}
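+
+/* Usage sketch (mirrors the sequence used in getBestNNIForBran() below): after the
+   purely topological change, the likelihood vectors of the two subtrees and the
+   central branch have to be refreshed before the new likelihood can be read, e.g.
+
+       pllTopologyPerformNNI(tr, p, PLL_NNI_P_NEXT);
+       pllUpdatePartials(tr, pr, p, PLL_FALSE);
+       pllUpdatePartials(tr, pr, p->back, PLL_FALSE);
+       update(tr, pr, p);                                   // re-optimize the central branch
+       pllEvaluateLikelihood(tr, pr, p, PLL_FALSE, PLL_FALSE);
+*/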
+
+/** @brief qsort() comparison callback that orders NNI moves by increasing \a deltaLH */
+static int cmp_nni(const void* nni1, const void* nni2) {
+	nniMove* myNNI1 = (nniMove*) nni1;
+	nniMove* myNNI2 = (nniMove*) nni2;
+	return (int) (1000000.f * myNNI1->deltaLH - 1000000.f * myNNI2->deltaLH);
+}
+
+/** @brief Gets the best NNI move for a branch
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node to use as origin for performing NNI
+
+    @param curLH
+      The current likelihood
+
+    @return
+      The best NNI move
+
+*/
+nniMove getBestNNIForBran(pllInstance* tr, partitionList *pr, nodeptr p,
+		double curLH) {
+	nodeptr q = p->back;
+	assert( ! isTip(p->number, tr->mxtips));
+	assert( ! isTip(q->number, tr->mxtips));
+#ifdef _DEBUG_NNI
+	pllTreeToNewick(tr->tree_string, tr, tr->start->back, TRUE, FALSE, 0, 0, 0, SUMMARIZE_LH, 0,0);
+	fprintf(stderr, "%s\n", tr->tree_string);
+#endif
+
+	/* Backup the current branch length */
+	double z0[PLL_NUM_BRANCHES];
+	int i;
+	for (i = 0; i < pr->numberOfPartitions; i++) {
+		z0[i] = p->z[i];
+	}
+#ifdef _DEBUG_NNI
+	double lhOld = tr->likelihood;
+	printf("lhOld: %f \n", lhOld);
+#endif
+	double lh0 = curLH;
+
+
+#ifdef _DEBUG_NNI
+	printf("lh0: %f \n", lh0);
+#endif
+	nniMove nni0; // nni0 means no NNI move is done
+	nni0.p = p;
+	nni0.nniType = 0;
+	nni0.deltaLH = 0;
+	for (i = 0; i < pr->numberOfPartitions; i++) {
+		nni0.z[i] = p->z[i];
+	}
+
+	/* Save the scaling factor */
+	// Now try to do an NNI move of type 1
+	pllTopologyPerformNNI(tr, p, PLL_NNI_P_NEXT);
+	double lh1 = tr->likelihood;
+	/* Update branch lengths */
+	pllUpdatePartials(tr, pr, p, PLL_FALSE);
+	pllUpdatePartials(tr, pr, q, PLL_FALSE);
+	update(tr, pr, p);
+	pllEvaluateLikelihood (tr, pr, p, PLL_FALSE, PLL_FALSE);
+
+	nniMove nni1;
+	nni1.p = p;
+	nni1.nniType = 1;
+	// Store the optimized and unoptimized central branch length
+	for (i = 0; i < pr->numberOfPartitions; i++) {
+		nni1.z[i] = p->z[i];
+		nni1.z0[i] = z0[i];
+	}
+	nni1.likelihood = lh1;
+	nni1.deltaLH = lh1 - lh0;
+#ifdef _DEBUG_NNI
+	printf("Delta likelihood of the 1.NNI move: %f\n", nni1.deltaLH);
+#endif
+
+	/* Restore previous NNI move */
+	pllTopologyPerformNNI(tr, p, PLL_NNI_P_NEXT);
+	/* Restore the old branch length */
+	for (i = 0; i < pr->numberOfPartitions; i++) {
+		p->z[i] = z0[i];
+		p->back->z[i] = z0[i];
+	}
+
+#ifdef _DEBUG_NNI
+	printf("Restore topology\n");
+	pllTreeToNewick(tr->tree_string, tr, tr->start->back, TRUE, FALSE, 0, 0, 0, SUMMARIZE_LH, 0,0);
+	fprintf(stderr, "%s\n", tr->tree_string);
+	pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+	printf("Likelihood after restoring from NNI 1: %f\n", tr->likelihood);
+#endif
+	/* Try to do an NNI move of type 2 */
+	pllTopologyPerformNNI(tr, p, 2);
+	double lh2 = tr->likelihood;
+	/* Update branch lengths */
+	pllUpdatePartials(tr, pr, p, PLL_FALSE);
+	pllUpdatePartials(tr, pr, q, PLL_FALSE);
+	update(tr, pr, p);
+	pllEvaluateLikelihood (tr, pr, p, PLL_FALSE, PLL_FALSE);
+
+	// Create the nniMove struct to store this move
+	nniMove nni2;
+	nni2.p = p;
+	nni2.nniType = 2;
+
+	// Store the optimized and unoptimized central branch length
+	for (i = 0; i < pr->numberOfPartitions; i++) {
+		nni2.z[i] = p->z[i];
+		nni2.z0[i] = z0[i];
+	}
+	nni2.likelihood = lh2;
+	nni2.deltaLH = lh2 - lh0;
+#ifdef _DEBUG_NNI
+	printf("Delta likelihood of the 2.NNI move: %f\n", nni2.deltaLH);
+#endif
+
+	/* Restore previous NNI move */
+	pllTopologyPerformNNI(tr, p, 2);
+	pllUpdatePartials(tr, pr, p, PLL_FALSE);
+	pllUpdatePartials(tr, pr, p->back, PLL_FALSE);
+	/* Restore the old branch length */
+	for (i = 0; i < pr->numberOfPartitions; i++) {
+		p->z[i] = z0[i];
+		p->back->z[i] = z0[i];
+	}
+	if (nni1.deltaLH > 0 && nni1.deltaLH >= nni2.deltaLH) {
+		return nni1;
+	} else if (nni1.deltaLH > 0 && nni1.deltaLH < nni2.deltaLH) {
+		return nni2;
+	} else if (nni1.deltaLH < 0 && nni2.deltaLH > 0) {
+		return nni2;
+	} else {
+		return nni0;
+	}
+}
+
+/** @brief Recursively evaluate NNI moves for all internal branches of a subtree
+
+    Traverses the subtree rooted at \a p, computes the best NNI move for every internal
+    branch via ::getBestNNIForBran, stores it in \a nniList (indexed by \a cnt) and
+    counts the improving moves in \a cnt_nni.
+*/
+void evalNNIForSubtree(pllInstance* tr, partitionList *pr, nodeptr p,
+		nniMove* nniList, int* cnt, int* cnt_nni, double curLH) {
+	if (!isTip(p->number, tr->mxtips)) {
+		nniList[*cnt] = getBestNNIForBran(tr, pr, p, curLH);
+		if (nniList[*cnt].deltaLH != 0.0) {
+			*cnt_nni = *cnt_nni + 1;
+		}
+		*cnt = *cnt + 1;
+		nodeptr q = p->next;
+		while (q != p) {
+			evalNNIForSubtree(tr, pr, q->back, nniList, cnt, cnt_nni, curLH);
+			q = q->next;
+		}
+	}
+}
+
+/** @brief Perform an NNI search
+
+    Modify the tree topology (and optionally the model parameters) of instance \a tr by
+    repeatedly applying non-conflicting NNI (Nearest Neighbor Interchange) moves that
+    improve the likelihood.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param estimateModel
+      Determines whether the model parameters should be optimized
+
+    @return
+      In case of success \b PLL_TRUE, otherwise \b PLL_FALSE
+
+*/
+int pllNniSearch(pllInstance * tr, partitionList *pr, int estimateModel) {
+
+	double curScore = tr->likelihood;
+
+	/* Initialize the NNI list */
+	nniMove* nniList = (nniMove*) malloc((tr->mxtips - 3) * sizeof(nniMove));
+	int i;
+	/* fill up the NNI list */
+	nodeptr p = tr->start->back;
+	nodeptr q = p->next;
+	int cnt = 0; // number of visited internal branches during NNI evaluation
+	int cnt_nni = 0; // number of positive NNI found
+	while (q != p) {
+		evalNNIForSubtree(tr, pr, q->back, nniList, &cnt, &cnt_nni, curScore);
+		q = q->next;
+	}
+	if (cnt_nni == 0)
+		return 0.0;
+
+	nniMove* impNNIList = (nniMove*) malloc(cnt_nni * sizeof(nniMove));
+	int j = 0;
+	for (i = 0; i < tr->mxtips - 3; i++) {
+		if (nniList[i].deltaLH > 0.0) {
+			impNNIList[j] = nniList[i];
+			j++;
+		}
+	}
+	// sort impNNIList
+	qsort(impNNIList, cnt_nni, sizeof(nniMove), cmp_nni);
+
+	// creating a list of non-conflicting positive NNI
+	nniMove* nonConfNNIList = (nniMove*) calloc(cnt_nni, sizeof(nniMove));
+
+	// the best NNI will always be taken
+	nonConfNNIList[0] = impNNIList[cnt_nni - 1];
+
+	// Filter out conflicting NNI
+	int numNonConflictNNI = 1; // size of the non-conflicting NNI list;
+	int k;
+	for (k = cnt_nni - 2; k >= 0; k--) {
+		int conflict = PLL_FALSE;
+		int j;
+		for (j = 0; j < numNonConflictNNI; j++) {
+			if (impNNIList[k].p->number == nonConfNNIList[j].p->number
+					|| impNNIList[k].p->number
+							== nonConfNNIList[j].p->back->number) {
+				conflict = PLL_TRUE;
+				break;
+			}
+		}
+		if (conflict) {
+			continue;
+		} else {
+			nonConfNNIList[numNonConflictNNI] = impNNIList[k];
+			numNonConflictNNI++;
+		}
+	}
+
+	// Applying non-conflicting NNI moves
+	double delta = 1.0; // portion of NNI moves to apply
+	int notImproved;
+	do {
+		notImproved = PLL_FALSE;
+		int numNNI2Apply = ceil(numNonConflictNNI * delta);
+		for (i = 0; i < numNNI2Apply; i++) {
+			// Just do the topological change
+			pllTopologyPerformNNI(tr, nonConfNNIList[i].p, nonConfNNIList[i].nniType);
+			pllUpdatePartials(tr, pr, nonConfNNIList[i].p, PLL_FALSE);
+			pllUpdatePartials(tr, pr, nonConfNNIList[i].p->back, PLL_FALSE);
+			// Apply the store branch length
+			int j;
+			for (j = 0; j < pr->numberOfPartitions; j++) {
+				nonConfNNIList[i].p->z[j] = nonConfNNIList[i].z[j];
+				nonConfNNIList[i].p->back->z[j] = nonConfNNIList[i].z[j];
+			}
+		}
+		// Re-optimize all branches
+		smoothTree(tr, pr, 2);
+		pllEvaluateLikelihood (tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+		if (estimateModel) {
+			modOpt(tr, pr, 0.1);
+		}
+		pllEvaluateLikelihood (tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+		if (tr->likelihood < curScore) {
+#ifdef _DEBUG_NNI
+			printf("Tree likelihood gets worse after applying NNI\n");
+			printf("curScore = %30.20f\n", curScore);
+			printf("newScore = %30.20f\n", tr->likelihood);
+			printf("Rolling back the tree\n");
+#endif
+			for (i = 0; i < numNNI2Apply; i++) {
+				pllTopologyPerformNNI(tr, nonConfNNIList[i].p, nonConfNNIList[i].nniType);
+				// Restore the branch length
+				int j;
+				for (j = 0; j < pr->numberOfPartitions; j++) {
+					nonConfNNIList[i].p->z[j] = nonConfNNIList[i].z0[j];
+					nonConfNNIList[i].p->back->z[j] = nonConfNNIList[i].z0[j];
+				}
+			}
+			pllEvaluateLikelihood (tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+#ifdef _DEBUG_NNI
+			printf("Tree likelihood after rolling back = %f \n",
+					tr->likelihood);
+#endif
+			notImproved = PLL_TRUE & (numNNI2Apply > 1);
+			delta = delta * 0.5;
+		}
+	} while (notImproved);
+	free(nniList);
+	free(impNNIList);
+	free(nonConfNNIList);
+
+	return PLL_TRUE;
+}
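+
+/* Usage sketch (illustrative): the search reads the current likelihood from
+   tr->likelihood, so it is typically preceded by an evaluation, e.g.
+
+       pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+       pllNniSearch(tr, pr, PLL_TRUE);   // PLL_TRUE: also re-optimize model parameters
+*/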
+
+
+/** @defgroup rearrangementGroup Topological rearrangements
+    
+    This set of functions handles the rearrangement of the tree topology
+*/
+
+
+/** @ingroup rearrangementGroup
+    @brief Create a list for storing topology rearrangements
+ 
+    Allocates space and initializes a structure that will hold information
+    of \a max topological rearrangements
+
+    @param max
+      Maximum number of elements that the structure should hold
+    
+    @note This should be called for creating a storage space (list) for
+    routines such as ::pllRearrangeSearch which compute the best NNI/SPR/TBR rearrangements.
+*/
+pllRearrangeList * pllCreateRearrangeList (int max)
+{
+  pllRearrangeList * bl;
+
+  bl = (pllRearrangeList *) malloc (sizeof (pllRearrangeList));
+
+  bl->max_entries = max;
+  bl->entries     = 0;
+  bl->rearr       = (pllRearrangeInfo *) malloc (max * sizeof (pllRearrangeInfo));
+
+  return bl;
+}
+
+/** @ingroup rearrangementGroup
+    @brief Deallocator for topology rearrangements list
+    
+    Call this to destroy (deallocate) the memory taken by the \a bestList which holds
+    topological rearrangements
+
+    @param bestList
+      Pointer to the list to be deallocated
+*/
+void pllDestroyRearrangeList (pllRearrangeList ** bestList)
+{
+  pllRearrangeList * bl;
+
+  bl = *bestList;
+
+  rax_free (bl->rearr);
+  rax_free (bl);
+
+  *bestList = NULL;
+}
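+
+/* Usage sketch (illustrative; the list is assumed to be filled by a rearrangement search
+   routine such as ::pllRearrangeSearch mentioned above): create a list, let the search
+   fill it, inspect the recorded moves and release the list, e.g.
+
+       int i;
+       pllRearrangeList * bestList = pllCreateRearrangeList(20);
+       ...                                     // run a rearrangement search that fills bestList
+       for (i = 0; i < bestList->entries; ++ i)
+         printf("move %d: logLH = %f\n", i, bestList->rearr[i].likelihood);
+       pllDestroyRearrangeList(&bestList);
+*/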
+
+
+/** @ingroup rearrangementGroup
+    @brief Store a rearrangement move to the list of best rearrangement moves
+
+     Checks if the likelihood yielded by the rearrangement move described in \a rearr
+     is better than any in the sorted list \a bestList. If it is, or
+     if there is still space in \a bestList, the info about the
+     move is inserted in the list.
+
+     @param bestList
+       The list of information about the best rearrangement moves
+
+     @param rearr
+       Info about the current rearrangement move
+
+     @return
+       Returns \b PLL_FALSE if the rearrangement move doesn't make it in the list, otherwise \b PLL_TRUE
+*/
+static int pllStoreRearrangement (pllRearrangeList * bestList, pllRearrangeInfo * rearr)
+ {
+   /* naive implementation of saving rearrangement moves */
+   int i;
+
+   for (i = 0; i < bestList->entries; ++ i)
+    {
+      /* Does the new rearrangement yield a better likelihood than the current entry in the list? */
+      if (rearr->likelihood > bestList->rearr[i].likelihood)
+       {
+         /* is there enough space in the array ? */
+         if (bestList->entries < bestList->max_entries)
+          {
+            /* slide the entries to the right and overwrite the i-th element with the new item */
+            memmove (&(bestList->rearr[i + 1]), &(bestList->rearr[i]), (bestList->entries - i ) * sizeof (pllRearrangeInfo));
+            ++ bestList->entries;
+          }
+         else
+          {
+            memmove (&(bestList->rearr[i + 1]), &(bestList->rearr[i]), (bestList->entries - i - 1 ) * sizeof (pllRearrangeInfo));
+          }
+         memcpy (&(bestList->rearr[i]), rearr, sizeof (pllRearrangeInfo));
+         return (PLL_TRUE);
+       }
+    }
+   if (bestList->entries < bestList->max_entries)
+    {
+      memcpy (&(bestList->rearr[bestList->entries]), rearr, sizeof (pllRearrangeInfo));
+      ++ bestList->entries;
+      return (PLL_TRUE);
+    }
+
+   return (PLL_FALSE);
+ }
+
+/** @ingroup rearrangementGroup
+    @brief Internal function for testing and saving an SPR move
+    
+    Checks the likelihood of the placement of the pruned subtree specified by \a p
+    to node \a q. If the likelihood is better than some in the sorted list 
+    \a bestList, or if there is still free space in \a bestList, then the SPR 
+    move is recorded (in \a bestList)
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Root of the subtree that is to be pruned
+
+    @param q
+      Where to place the pruned subtree (between \a q and \a q->back)
+
+    @param bestList
+      Where to store the SPR move
+
+    @note Internal function which is not part of the PLL API and therefore should not be
+    called by the user
+
+    @return
+*/
+static int
+pllTestInsertBIG (pllInstance * tr, partitionList * pr, nodeptr p, nodeptr q, pllRearrangeList * bestList)
+{
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+  pllRearrangeInfo rearr;
+
+  double  qz[PLL_NUM_BRANCHES], pz[PLL_NUM_BRANCHES];
+  nodeptr  r;
+  //double startLH = tr->endLH;
+  int i;
+
+  r = q->back; 
+  for(i = 0; i < numBranches; i++)
+  {
+    qz[i] = q->z[i];
+    pz[i] = p->z[i];
+  }
+
+  if (! insertBIG(tr, pr, p, q))       return PLL_FALSE;
+
+  pllEvaluateLikelihood (tr, pr, p->next->next, PLL_FALSE, PLL_FALSE);
+  
+  rearr.rearrangeType  = PLL_REARRANGE_SPR;
+  rearr.likelihood     = tr->likelihood;
+  rearr.SPR.removeNode = p;
+  rearr.SPR.insertNode = q;
+  for (i = 0; i < numBranches; ++ i)
+   {
+     rearr.SPR.zqr[i] = tr->zqr[i];
+   }
+
+  pllStoreRearrangement (bestList, &rearr);
+
+/*
+  if(tr->likelihood > tr->bestOfNode)
+  {
+    pllStoreRearrangement (bestList, rearr)
+    tr->bestOfNode = tr->likelihood;
+    tr->insertNode = q;
+    tr->removeNode = p;   
+    for(i = 0; i < numBranches; i++)
+    {
+      tr->currentZQR[i] = tr->zqr[i];           
+      tr->currentLZR[i] = tr->lzr[i];
+      tr->currentLZQ[i] = tr->lzq[i];
+      tr->currentLZS[i] = tr->lzs[i];      
+    }
+  }
+
+  if(tr->likelihood > tr->endLH)
+  {			  
+    
+    tr->insertNode = q;
+    tr->removeNode = p;   
+    for(i = 0; i < numBranches; i++)
+      tr->currentZQR[i] = tr->zqr[i];      
+    tr->endLH = tr->likelihood;                      
+  }        
+*/
+  /* reset the topology so that it is the same as it was before calling insertBIG */
+  hookup(q, r, qz, numBranches);
+
+  p->next->next->back = p->next->back = (nodeptr) NULL;
+
+  if(tr->thoroughInsertion)
+  {
+    nodeptr s = p->back;
+    hookup(p, s, pz, numBranches);
+  } 
+
+/*
+  if((tr->doCutoff) && (tr->likelihood < startLH))
+  {
+    tr->lhAVG += (startLH - tr->likelihood);
+    tr->lhDEC++;
+    if((startLH - tr->likelihood) >= tr->lhCutoff)
+      return PLL_FALSE;	    
+    else
+      return PLL_TRUE;
+  }
+  else
+    return PLL_TRUE;
+  */
+  return (PLL_TRUE);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Internal function for recursively traversing a tree and testing a possible subtree insertion
+
+    Recursively traverses the tree rooted at \a q in the direction of \a q->next->back and \a q->next->next->back
+    and at each node tests the placement of the pruned subtree rooted at \a p by calling the function
+    \a pllTestInsertBIG, which in turn saves the computed SPR in \a bestList if a) there is still space in
+    the \a bestList or b) if the likelihood of the SPR is better than any of the ones in \a bestList.
+
+    @note This function is not part of the API and should not be called by the user.
+*/
+static void pllTraverseUpdate (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q, int mintrav, int maxtrav, pllRearrangeList * bestList)
+{  
+  if (--mintrav <= 0) 
+  {              
+    if (! pllTestInsertBIG(tr, pr, p, q, bestList))  return;
+
+  }
+
+  if ((!isTip(q->number, tr->mxtips)) && (--maxtrav > 0)) 
+  {    
+    pllTraverseUpdate(tr, pr, p, q->next->back, mintrav, maxtrav, bestList);
+    pllTraverseUpdate(tr, pr, p, q->next->next->back, mintrav, maxtrav, bestList);
+  }
+} 
+
+
+/** @ingroup rearrangementGroup
+    @brief Internal function for computing SPR moves
+
+    Compute a list of at most \a max SPR moves that can be performed by pruning
+    the subtree rooted at node \a p and testing all possible placements in a
+    radius of at least \a mintrav nodes and at most \a maxtrav nodes from \a p.
+    Note that \a tr->thoroughInsertion affects the behaviour of the function (see note).
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node specifying the root of the pruned subtree, i.e. where to prune.
+
+    @param mintrav
+      Minimum distance from \a p where to try inserting the pruned subtree
+
+    @param maxtrav
+      Maximum distance from \a p where to try inserting the pruned subtree
+
+    @param bestList
+      The list of best topological rearrangements
+
+    @note This function is not part of the API and should not be called by the user
+    as it is called internally by the API function \a pllComputeSPR. 
+    Also, setting \a tr->thoroughInsertion affects this function. For each tested SPR
+    the new branch lengths will also be optimized. This computes better likelihoods
+    but also slows down the method considerably.
+*/
+static int pllTestSPR (pllInstance * tr, partitionList * pr, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList)
+{
+  nodeptr 
+    p1, p2, q, q1, q2;
+  double 
+    p1z[PLL_NUM_BRANCHES], p2z[PLL_NUM_BRANCHES], q1z[PLL_NUM_BRANCHES], q2z[PLL_NUM_BRANCHES];
+  int
+    mintrav2, i;
+  int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  if (maxtrav < 1 || mintrav > maxtrav) return (PLL_FALSE);
+  q = p->back;
+
+  if (!isTip (p->number, tr->mxtips))
+   {
+     p1 = p->next->back;
+     p2 = p->next->next->back;
+
+     if (!isTip (p1->number, tr->mxtips) || !isTip (p2->number, tr->mxtips))
+      {
+        /* save branch lengths before splitting the tree in two components */
+        for (i = 0; i < numBranches; ++ i)
+         {
+           p1z[i] = p1->z[i];
+           p2z[i] = p2->z[i];
+         }
+
+        /* split the tree in two components */
+        if (! removeNodeBIG (tr, pr, p, numBranches)) return PLL_BADREAR;
+
+        /* recursively traverse and perform SPR on the subtree rooted at p1 */
+        if (!isTip (p1->number, tr->mxtips))
+         {
+           pllTraverseUpdate (tr, pr, p, p1->next->back,       mintrav, maxtrav, bestList);
+           pllTraverseUpdate (tr, pr, p, p1->next->next->back, mintrav, maxtrav, bestList);
+         }
+
+        /* recursively traverse and perform SPR on the subtree rooted at p2 */
+        if (!isTip (p2->number, tr->mxtips))
+         {
+           pllTraverseUpdate (tr, pr, p, p2->next->back,       mintrav, maxtrav, bestList);
+           pllTraverseUpdate (tr, pr, p, p2->next->next->back, mintrav, maxtrav, bestList);
+         }
+
+        /* restore the topology as it was before the split */
+        hookup (p->next,       p1, p1z, numBranches);
+        hookup (p->next->next, p2, p2z, numBranches);
+        pllUpdatePartials (tr, pr, p, PLL_FALSE);
+      }
+   }
+
+  if (!isTip (q->number, tr->mxtips) && maxtrav > 0)
+   {
+     q1 = q->next->back;
+     q2 = q->next->next->back;
+
+    /* why so many conditions? Why is it not analogous to the previous if for node p? */
+    if (
+        (
+         ! isTip(q1->number, tr->mxtips) && 
+         (! isTip(q1->next->back->number, tr->mxtips) || ! isTip(q1->next->next->back->number, tr->mxtips))
+        )
+        ||
+        (
+         ! isTip(q2->number, tr->mxtips) && 
+         (! isTip(q2->next->back->number, tr->mxtips) || ! isTip(q2->next->next->back->number, tr->mxtips))
+        )
+       )
+     {
+       for (i = 0; i < numBranches; ++ i)
+        {
+          q1z[i] = q1->z[i];
+          q2z[i] = q2->z[i];
+        }
+
+       if (! removeNodeBIG (tr, pr, q, numBranches)) return PLL_BADREAR;
+
+       mintrav2 = mintrav > 2 ? mintrav : 2;
+
+       if (!isTip (q1->number, tr->mxtips))
+        {
+          pllTraverseUpdate (tr, pr, q, q1->next->back,       mintrav2, maxtrav, bestList);
+          pllTraverseUpdate (tr, pr, q, q1->next->next->back, mintrav2, maxtrav, bestList);
+        }
+
+       if (!isTip (q2->number, tr->mxtips))
+        {
+          pllTraverseUpdate (tr, pr, q, q2->next->back,       mintrav2, maxtrav, bestList);
+          pllTraverseUpdate (tr, pr, q, q2->next->next->back, mintrav2, maxtrav, bestList);
+        }
+
+       hookup (q->next,       q1, q1z, numBranches);
+       hookup (q->next->next, q2, q2z, numBranches);
+       pllUpdatePartials (tr, pr, q, PLL_FALSE);
+     }
+   }
+  return (PLL_TRUE);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Compute a list of possible SPR moves
+    
+    Iteratively tries all possible SPR moves that can be performed by
+    pruning the subtree rooted at \a p and testing all possible placements
+    in a radius of at least \a mintrav nodes and at most \a maxtrav nodes from
+    \a p. Note that \a tr->thoroughInsertion affects the behaviour of the function (see note).
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node specifying the root of the pruned subtree, i.e. where to prune.
+
+    @param mintrav
+      Minimum distance from \a p where to try inserting the pruned subtree
+
+    @param maxtrav
+      Maximum distance from \a p where to try inserting the pruned subtree
+
+    @note
+      Setting \a tr->thoroughInsertion affects this function. For each tested SPR
+      the new branch lengths will also be optimized. This computes better likelihoods
+      but also slows down the method considerably.
+*/
+static void 
+pllComputeSPR (pllInstance * tr, partitionList * pr, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList)
+{
+
+  tr->startLH = tr->endLH = tr->likelihood;
+
+  /* TODO: Add cutoff code */
+
+  tr->bestOfNode = PLL_UNLIKELY;
+
+  pllTestSPR (tr, pr, p, mintrav, maxtrav, bestList);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Return the yielded likelihood of an NNI move, without altering the topology
+
+    This function performs the NNI move of type \a swapType at node \a p, optimizes
+    the branch with endpoints \a p and \a p->back and evaluates the resulting likelihood.
+    It then restores the original topology and returns the likelihood that the NNI
+    move yielded.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Where to perform the NNI move
+
+    @param swapType
+      What type of NNI move to perform
+
+    @return
+      The likelihood yielded from the NNI
+*/
+static double 
+pllTestNNILikelihood (pllInstance * tr, partitionList * pr, nodeptr p, int swapType)
+{
+  double lh;
+  double z0[PLL_NUM_BRANCHES];
+  int i;
+
+  /* store the original branch lengths and likelihood. The original branch lengths could
+  be passed as a parameter in order to avoid duplicate computations because of the two
+  NNI moves */
+  for (i = 0; i < pr->numberOfPartitions; ++ i)
+   {
+     z0[i] = p->z[i];
+   }
+
+  /* perform NNI */
+  pllTopologyPerformNNI(tr, p, swapType);
+  /* recompute the likelihood vectors of the two subtrees rooted at p and p->back,
+     optimize the branch lengths and evaluate the likelihood  */
+  pllUpdatePartials (tr, pr, p,       PLL_FALSE);
+  pllUpdatePartials (tr, pr, p->back, PLL_FALSE);
+  update (tr, pr, p);
+  pllEvaluateLikelihood (tr, pr, p, PLL_FALSE, PLL_FALSE);
+  lh = tr->likelihood;
+
+  /* restore topology */
+  pllTopologyPerformNNI(tr, p, swapType);
+  pllUpdatePartials (tr, pr, p,       PLL_FALSE);
+  pllUpdatePartials (tr, pr, p->back, PLL_FALSE);
+  //update (tr, pr, p);
+  pllEvaluateLikelihood (tr, pr, p, PLL_FALSE, PLL_FALSE);
+  for (i = 0; i < pr->numberOfPartitions; ++ i)
+   {
+     p->z[i] = p->back->z[i] = z0[i];
+   }
+
+  return lh;
+}
+/** @ingroup rearrangementGroup
+    @brief Compare NNI likelihoods at a node and store the best move in the rearrangement list
+
+    Compares the two possible NNI moves that can be performed at node \a p. If
+    either move improves the likelihood over that of the original topology, the
+    move yielding the higher likelihood is inserted (if possible) into the list
+    of best rearrangement moves.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node at which the two NNI moves are tested
+
+    @param bestList
+      Rearrangement moves list
+*/
+static void pllTestNNI (pllInstance * tr, partitionList * pr, nodeptr p, pllRearrangeList * bestList)
+{
+  double lh0, lh1, lh2;
+  pllRearrangeInfo rearr;
+
+  /* store the original likelihood */
+  lh0 = tr->likelihood;
+
+  lh1 = pllTestNNILikelihood (tr, pr, p, PLL_NNI_P_NEXT);
+  lh2 = pllTestNNILikelihood (tr, pr, p, PLL_NNI_P_NEXTNEXT);
+
+  if (lh0 > lh1 && lh0 > lh2) return;
+
+  /* set the arrangement structure */
+  rearr.rearrangeType  = PLL_REARRANGE_NNI;
+  rearr.likelihood     = PLL_MAX (lh1, lh2);
+  rearr.NNI.originNode = p;
+  rearr.NNI.swapType   = (lh1 > lh2) ? PLL_NNI_P_NEXT : PLL_NNI_P_NEXTNEXT;
+
+  /* try to store it in the best list */
+  pllStoreRearrangement (bestList, &rearr);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Recursive traversal of the tree structure for testing NNI moves
+ 
+    Recursively traverses the tree structure and tests all allowed NNI
+    moves in the area specified by \a mintrav and \a maxtrav. For more
+    information and details on the function arguments check ::pllSearchNNI
+*/
+static void 
+pllTraverseNNI (pllInstance *tr, partitionList *pr, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList)
+{
+  if (isTip (p->number, tr->mxtips)) return;
+
+  /* if we are at the right radius then compute the NNIs for nodes p->next and p->next->next */
+  if (!mintrav)
+   {
+     pllTestNNI (tr, pr, p->next, bestList);
+     pllTestNNI (tr, pr, p->next->next, bestList);
+   }
+  
+  /* and then avoid computing the NNIs for nodes p->next->back and p->next->next->back as they are
+  the same as the ones computed in the two lines above. This way we do not need to resolve conflicts
+  later on as in the old code */
+  if (maxtrav)
+   {
+     if (!isTip (p->next->back->number, tr->mxtips))       
+       pllTraverseNNI (tr, pr, p->next->back,       mintrav ? mintrav - 1 : 0, maxtrav - 1, bestList);
+     if (!isTip (p->next->next->back->number, tr->mxtips)) 
+       pllTraverseNNI (tr, pr, p->next->next->back, mintrav ? mintrav - 1 : 0, maxtrav - 1, bestList);
+   }
+}
+
+/** @ingroup rearrangementGroup
+    @brief Compute a list of possible NNI moves
+    
+    Iteratively tries all possible NNI moves at each node that is at
+    least \a mintrav and at most \a maxtrav nodes away from node \a p.
+    At each NNI move, the likelihood is tested and if it is higher than
+    the likelihood of an element in the sorted (by likelihood) list 
+    \a bestList, or if there is still empty space in \a bestList, it is
+    inserted at the corresponding position.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node specifying the point where the NNI will be performed.
+
+    @param mintrav
+      Minimum distance from \a p where the NNI can be tested 
+
+    @param maxtrav
+      Maximum distance from \a p where to try NNIs
+
+    @param bestList
+      List in which the best rearrangement moves found are stored
+*/
+static void
+pllSearchNNI (pllInstance * tr, partitionList * pr, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList)
+{
+  /* avoid conflicts by precomputing the NNI of the first node */
+
+  if (mintrav == 0) 
+    pllTestNNI (tr, pr, p, bestList);
+  
+  pllTraverseNNI (tr, pr, p, mintrav, maxtrav, bestList);
+  if (maxtrav)
+    pllTraverseNNI (tr, pr, p->back, mintrav, maxtrav - 1, bestList);
+
+}
+
+/** @ingroup rearrangementGroup
+    @brief Create rollback information for an SPR move
+    
+    Creates a structure of type ::pllRollbackInfo and fills it with rollback
+    information about the SPR move described in \a rearr. The rollback info
+    is stored in the PLL instance in a LIFO manner.
+
+    @param tr
+      PLL instance
+
+    @param rearr
+      Description of the SPR move
+
+    @param numBranches
+      Number of partitions
+*/
+static void 
+pllCreateSprInfoRollback (pllInstance * tr, pllRearrangeInfo * rearr, int numBranches)
+{
+  pllRollbackInfo * sprRb;
+  nodeptr p, q;
+  int i;
+
+  p = rearr->SPR.removeNode;
+  q = rearr->SPR.insertNode;
+
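+  /* allocate the rollback record and the four per-branch-length arrays (zp,
+     zpn, zpnn, zqr) in one block; the arrays live directly behind the struct
+     and are addressed via the offsets computed below */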
+  sprRb = (pllRollbackInfo *) rax_malloc (sizeof (pllRollbackInfo) + 4 * numBranches * sizeof (double));
+  sprRb->SPR.zp   = (double *) ((char *)sprRb + sizeof (pllRollbackInfo));
+  sprRb->SPR.zpn  = (double *) ((char *)sprRb + sizeof (pllRollbackInfo) + numBranches * sizeof (double));
+  sprRb->SPR.zpnn = (double *) ((char *)sprRb + sizeof (pllRollbackInfo) + 2 * numBranches * sizeof (double));
+  sprRb->SPR.zqr  = (double *) ((char *)sprRb + sizeof (pllRollbackInfo) + 3 * numBranches * sizeof (double));
+
+  for (i = 0; i < numBranches; ++ i)
+   {
+     sprRb->SPR.zp[i]   = p->z[i];
+     sprRb->SPR.zpn[i]  = p->next->z[i];
+     sprRb->SPR.zpnn[i] = p->next->next->z[i];
+     sprRb->SPR.zqr[i]  = q->z[i];
+   }
+
+  sprRb->SPR.pn  = p->next->back;
+  sprRb->SPR.pnn = p->next->next->back;
+  sprRb->SPR.r   = q->back;
+  sprRb->SPR.q   = q;
+  sprRb->SPR.p   = p;
+
+  sprRb->rearrangeType = PLL_REARRANGE_SPR;
+
+  pllStackPush (&(tr->rearrangeHistory), (void *) sprRb);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Create rollback information for an NNI move
+
+    Creates a structure of type ::pllRollbackInfo and fills it with rollback
+    information about the NNI move described in \a rearr. The rollback info
+    is stored in the PLL instance in a LIFO manner.
+
+    @param tr
+      PLL instance
+
+    @param rearr
+      Description of the NNI move
+*/
+static void
+pllCreateNniInfoRollback (pllInstance * tr, pllRearrangeInfo * rearr)
+{
+  /*TODO: add the branches ? */
+  pllRollbackInfo * ri;
+
+  ri = (pllRollbackInfo *) rax_malloc (sizeof (pllRollbackInfo));
+
+  ri->rearrangeType = PLL_REARRANGE_NNI;
+
+  ri->NNI.origin   = rearr->NNI.originNode;
+  ri->NNI.swapType = rearr->NNI.swapType;
+
+  pllStackPush (&(tr->rearrangeHistory), (void *) ri);
+  
+}
+
+
+/** @ingroup rearrangementGroup
+    @brief Generic function for creating rollback information
+
+    Creates a structure of type ::pllRollbackInfo and fills it with rollback
+    information about the move described in \a rearr. The rollback info
+    is stored in the PLL instance in a LIFO manner
+
+    @param tr
+      PLL instance
+
+    @param rearr
+      Description of the rearrangement move
+
+    @param numBranches
+      Number of partitions
+*/
+static void
+pllCreateRollbackInfo (pllInstance * tr, pllRearrangeInfo * rearr, int numBranches)
+{
+  switch (rearr->rearrangeType)
+   {
+     case PLL_REARRANGE_NNI:
+       pllCreateNniInfoRollback (tr, rearr);
+       break;
+     case PLL_REARRANGE_SPR:
+       pllCreateSprInfoRollback (tr, rearr, numBranches);
+       break;
+     default:
+       break;
+   }
+
+}
+
+
+/** @ingroup rearrangementGroup
+    @brief Rollback an SPR move
+
+    Perform a rollback (undo) on the last SPR move.
+    
+    @param pr
+      List of partitions
+
+    @param ri
+      Rollback information
+*/
+static void
+pllRollbackSPR (partitionList * pr, pllRollbackInfo * ri)
+{
+  int numBranches;
+
+  numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  hookup (ri->SPR.p->next,       ri->SPR.pn,      ri->SPR.zpn,  numBranches);
+  hookup (ri->SPR.p->next->next, ri->SPR.pnn,     ri->SPR.zpnn, numBranches); 
+  hookup (ri->SPR.p,             ri->SPR.p->back, ri->SPR.zp,   numBranches);
+  hookup (ri->SPR.q,             ri->SPR.r,       ri->SPR.zqr,  numBranches);
+
+  rax_free (ri);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Rollback an NNI move
+
+    Perform a rollback (undo) on the last NNI move.
+    
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param ri
+      Rollback information
+*/
+static void
+pllRollbackNNI (pllInstance * tr, partitionList * pr, pllRollbackInfo * ri)
+{
+  nodeptr p = ri->NNI.origin;
+
+  pllTopologyPerformNNI(tr, p, ri->NNI.swapType);
+  pllUpdatePartials (tr, pr, p,       PLL_FALSE);
+  pllUpdatePartials (tr, pr, p->back, PLL_FALSE);
+  update (tr, pr, p);
+  pllEvaluateLikelihood (tr, pr, p, PLL_FALSE, PLL_FALSE);
+  
+  rax_free (ri);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Rollback the last committed rearrangement move
+    
+    Perform a rollback (undo) on the last committed rearrangement move.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @return
+      Returns \b PLL_TRUE if the rollback was successful, otherwise \b PLL_FALSE
+      (if no rollback was done)
+*/
+int 
+pllRearrangeRollback (pllInstance * tr, partitionList * pr)
+{
+  pllRollbackInfo * ri;
+  
+  ri = (pllRollbackInfo *) pllStackPop (&(tr->rearrangeHistory));
+  if (!ri) return (PLL_FALSE);
+
+  switch (ri->rearrangeType)
+   {
+     case PLL_REARRANGE_NNI:
+       pllRollbackNNI (tr, pr, ri);
+       break;
+     case PLL_REARRANGE_SPR:
+       pllRollbackSPR (pr, ri);
+       break;
+     default:
+       rax_free (ri);
+       return (PLL_FALSE);
+   }
+
+  return (PLL_TRUE);
+  
+}
+
+
+/** @ingroup rearrangementGroup
+    @brief Commit a rearrangement move
+
+    Applies the rearrangement move specified in \a rearr to the tree topology in \a tr. 
+    In case of SPR moves, if
+    \a tr->thoroughInsertion is set to \b PLL_TRUE, the new branch lengths are also optimized. 
+    The function stores rollback information in pllInstance::rearrangeHistory if \a saveRollbackInfo
+    is set to \b PLL_TRUE. This way, the rearrangement move can be rolled back (undone) by calling
+    ::pllRearrangeRollback
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param rearr
+      A \a pllRearrangeInfo structure that contains information about the rearrangement move
+
+    @param saveRollbackInfo
+      If set to \b PLL_TRUE, rollback info will be kept for undoing the rearrangement move
+*/
+void
+pllRearrangeCommit (pllInstance * tr, partitionList * pr, pllRearrangeInfo * rearr, int saveRollbackInfo)
+{
+  int numBranches;
+
+  numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  if (saveRollbackInfo)
+    pllCreateRollbackInfo (tr, rearr, numBranches);
+
+  switch (rearr->rearrangeType)
+   {
+     case PLL_REARRANGE_NNI:
+       pllTopologyPerformNNI(tr, rearr->NNI.originNode, rearr->NNI.swapType);
+       pllUpdatePartials (tr, pr, rearr->NNI.originNode, PLL_FALSE);
+       pllUpdatePartials (tr, pr, rearr->NNI.originNode->back, PLL_FALSE);
+       update (tr, pr, rearr->NNI.originNode);
+       pllEvaluateLikelihood (tr, pr, rearr->NNI.originNode, PLL_FALSE, PLL_FALSE);
+       break;
+     case PLL_REARRANGE_SPR:
+       removeNodeBIG (tr, pr, rearr->SPR.removeNode, numBranches);
+       insertBIG     (tr, pr, rearr->SPR.removeNode, rearr->SPR.insertNode);
+       break;
+     default:
+       break;
+   }
+}
+
+
+/******** new rearrangement functions ****************/
+
+/* change this to return the number of new elements in the list */
+/** @ingroup rearrangementGroup
+    @brief Search for rearrangement topologies
+    
+    Search for possible rearrangement moves of type \a rearrangeType in the
+    annular area defined by the minimal radius \a mintrav and the maximal radius
+    \a maxtrav. If the resulting likelihood is better than the current one, try
+    to insert the move specification in \a bestList, a list that holds the
+    rearrangement info of the best moves sorted by likelihood (descending
+    order).
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param rearrangeType
+      Type of rearrangement. Can be \b PLL_REARRANGE_SPR or \b PLL_REARRANGE_NNI
+
+    @param p
+      Point of origin, i.e. where to start searching from
+
+    @param mintrav
+      The minimal radius of the annulus
+
+    @param maxtrav
+      The maximal radius of the annulus
+
+    @param bestList
+      List that holds the details of the best rearrangement moves found
+
+    @note
+      If \a bestList is not empty, the existing entries will not be altered unless
+      better rearrangement moves (i.e. moves yielding a better likelihood) are found
+      while the list is full, in which case the entries with the worst likelihood are
+      discarded.
+*/
+void
+pllRearrangeSearch (pllInstance * tr, partitionList * pr, int rearrangeType, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList)
+{
+  switch (rearrangeType)
+   {
+     case PLL_REARRANGE_SPR:
+       pllComputeSPR (tr, pr, p, mintrav, maxtrav, bestList);
+       break;
+
+     case PLL_REARRANGE_NNI:
+       pllSearchNNI (tr, pr, p, mintrav, maxtrav, bestList);
+       break;
+
+     case PLL_REARRANGE_TBR:
+       break;
+     default:
+       break;
+   }
+}
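+
+/* A minimal usage sketch of the public rearrangement API above; it is not part
+ * of the upstream sources. The list constructor pllCreateRearrangeList() and
+ * the field bestList->rearr[0] live outside this hunk and are assumptions used
+ * only to illustrate the search -> commit -> rollback call sequence.
+ *
+ *   pllRearrangeList * bestList = pllCreateRearrangeList (20);      // assumed helper
+ *
+ *   // collect the best SPR moves in an annulus of radius 1..10 around node p
+ *   pllRearrangeSearch (tr, pr, PLL_REARRANGE_SPR, p, 1, 10, bestList);
+ *
+ *   // apply the best move found, keeping rollback information
+ *   pllRearrangeCommit (tr, pr, &(bestList->rearr[0]), PLL_TRUE);   // field name assumed
+ *
+ *   // undo the move again if the new topology did not improve the likelihood
+ *   if (tr->likelihood < previousLikelihood)
+ *     pllRearrangeRollback (tr, pr);
+ */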
+
+
+static int
+determineRearrangementSetting(pllInstance *tr, partitionList *pr,
+    bestlist *bestT, bestlist *bt)
+{
+  int i, mintrav, maxtrav, bestTrav, impr, index, MaxFast, *perm = (int*) NULL;
+  double startLH;
+  pllBoolean cutoff;
+
+  MaxFast = 26;
+
+  startLH = tr->likelihood;
+
+  cutoff = tr->doCutoff;
+  tr->doCutoff = PLL_FALSE;
+
+  mintrav = 1;
+  maxtrav = 5;
+
+  bestTrav = maxtrav = 5;
+
+  impr = 1;
+
+  resetBestTree(bt);
+
+  if (tr->permuteTreeoptimize)
+    {
+      int n = tr->mxtips + tr->mxtips - 2;
+      perm = (int *) rax_malloc(sizeof(int) * (n + 1));
+      makePermutation(perm, n, tr);
+    }
+
+  while (impr && maxtrav < MaxFast)
+    {
+      recallBestTree(bestT, 1, tr, pr);
+      nodeRectifier(tr);
+
+      if (maxtrav > tr->ntips - 3)
+        maxtrav = tr->ntips - 3;
+
+      tr->startLH = tr->endLH = tr->likelihood;
+
+      for (i = 1; i <= tr->mxtips + tr->mxtips - 2; i++)
+        {
+
+          if (tr->permuteTreeoptimize)
+            index = perm[i];
+          else
+            index = i;
+
+          tr->bestOfNode = PLL_UNLIKELY;
+          if (rearrangeBIG(tr, pr, tr->nodep[index], mintrav, maxtrav))
+            {
+              if (tr->endLH > tr->startLH)
+                {
+                  restoreTreeFast(tr, pr);
+                  tr->startLH = tr->endLH = tr->likelihood;
+                }
+            }
+        }
+
+      pllOptimizeBranchLengths(tr, pr, 8);
+      saveBestTree(bt, tr,
+          pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+
+      if (tr->likelihood > startLH)
+        {
+          startLH = tr->likelihood;
+          bestTrav = maxtrav;
+          impr = 1;
+        }
+      else
+        {
+          impr = 0;
+        }
+      maxtrav += 5;
+
+      if (tr->doCutoff)
+        {
+          tr->lhCutoff = (tr->lhAVG) / ((double) (tr->lhDEC));
+
+          tr->itCount = tr->itCount + 1;
+          tr->lhAVG = 0;
+          tr->lhDEC = 0;
+        }
+    }
+
+  recallBestTree(bt, 1, tr, pr);
+  tr->doCutoff = cutoff;
+
+  if (tr->permuteTreeoptimize)
+    rax_free(perm);
+
+  return bestTrav;
+}
+
+
+static void hash_dealloc_bipentry (void * entry)
+{
+  pllBipartitionEntry * e = (pllBipartitionEntry *)entry;
+
+  if(e->bitVector)     rax_free(e->bitVector);
+  if(e->treeVector)    rax_free(e->treeVector);
+  if(e->supportVector) rax_free(e->supportVector);
+
+}
+
+/** @ingroup rearrangementGroup
+    @brief RAxML algorithm for ML search
+
+    RAxML algorithm for searching the Maximum Likelihood tree and model.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param estimateModel
+      If true, model parameters are optimized in a ML framework.
+
+    @note
+      For datasets with a large number of taxa, setting tr->searchConvergenceCriterion to
+      PLL_TRUE can reduce the execution time by up to 50% by checking for topology convergence.
+*/
+int
+pllRaxmlSearchAlgorithm(pllInstance * tr, partitionList * pr,
+    pllBoolean estimateModel)
+{
+  pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+  pllOptimizeBranchLengths(tr, pr, 32);
+
+  unsigned int vLength = 0;
+  int i, impr, bestTrav, rearrangementsMax = 0, rearrangementsMin = 0,
+      thoroughIterations = 0, fastIterations = 0;
+
+  double lh, previousLh, difference, epsilon;
+  bestlist *bestT, *bt;
+  infoList iList;
+  pllOptimizeBranchLengths(tr, pr, 32);
+
+  pllHashTable *h = NULL;
+  //hashtable *h = NULL;
+  unsigned int **bitVectors = (unsigned int**) NULL;
+
+  /* Security check... These variables might not have been initialized! */
+  if (tr->stepwidth == 0) tr->stepwidth = 5;
+  if (tr->max_rearrange == 0) tr->max_rearrange = 21;
+
+  if (tr->searchConvergenceCriterion)
+    {
+      bitVectors = initBitVector(tr->mxtips, &vLength);
+      //h = initHashTable(tr->mxtips * 4);
+      h = pllHashInit (tr->mxtips * 4);
+    }
+
+  bestT = (bestlist *) rax_malloc(sizeof(bestlist));
+  bestT->ninit = 0;
+  initBestTree(bestT, 1, tr->mxtips);
+
+  bt = (bestlist *) rax_malloc(sizeof(bestlist));
+  bt->ninit = 0;
+  initBestTree(bt, 20, tr->mxtips);
+
+  initInfoList(&iList, 50);
+
+  difference = 10.0;
+  epsilon = tr->likelihoodEpsilon;
+
+  tr->thoroughInsertion = 0;
+
+  if (estimateModel)
+    {
+      pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+      pllOptimizeModelParameters(tr, pr, 10.0);
+    }
+  else
+    pllOptimizeBranchLengths(tr, pr, 64);
+
+  saveBestTree(bestT, tr,
+      pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+
+  if (!tr->initialSet)
+    bestTrav = tr->bestTrav = determineRearrangementSetting(tr, pr, bestT, bt);
+  else
+    bestTrav = tr->bestTrav = tr->initial;
+
+  if (estimateModel)
+    {
+      pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+      pllOptimizeModelParameters(tr, pr, 5.0);
+    }
+  else
+    pllOptimizeBranchLengths(tr, pr, 32);
+
+  saveBestTree(bestT, tr,
+      pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+  impr = 1;
+  if (tr->doCutoff)
+    tr->itCount = 0;
+
+  while (impr)
+    {
+      recallBestTree(bestT, 1, tr, pr);
+
+      if (tr->searchConvergenceCriterion)
+        {
+          int bCounter = 0;
+
+          if (fastIterations > 1)
+            cleanupHashTable(h, (fastIterations % 2));
+
+          bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back, tr->mxtips,
+              vLength, h, fastIterations % 2, PLL_BIPARTITIONS_RF,
+              (branchInfo *) NULL, &bCounter, 1, PLL_FALSE, PLL_FALSE, 0);
+
+          assert(bCounter == tr->mxtips - 3);
+
+          if (fastIterations > 0)
+            {
+              double rrf = convergenceCriterion(h, tr->mxtips);
+
+              if (rrf <= 0.01) /* 1% cutoff */
+                {
+                  cleanupHashTable(h, 0);
+                  cleanupHashTable(h, 1);
+                  goto cleanup_fast;
+                }
+            }
+        }
+
+      fastIterations++;
+
+      pllOptimizeBranchLengths(tr, pr, 32);
+
+      saveBestTree(bestT, tr,
+          pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+
+      lh = previousLh = tr->likelihood;
+
+      treeOptimizeRapid(tr, pr, 1, bestTrav, bt, &iList);
+
+      impr = 0;
+
+      for (i = 1; i <= bt->nvalid; i++)
+        {
+          recallBestTree(bt, i, tr, pr);
+
+          pllOptimizeBranchLengths(tr, pr, 8);
+
+          difference = (
+              (tr->likelihood > previousLh) ?
+                  tr->likelihood - previousLh : previousLh - tr->likelihood);
+          if (tr->likelihood > lh && difference > epsilon)
+            {
+              impr = 1;
+              lh = tr->likelihood;
+              saveBestTree(bestT, tr,
+                  pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+            }
+        }
+    }
+
+  if (tr->searchConvergenceCriterion)
+    {
+      cleanupHashTable(h, 0);
+      cleanupHashTable(h, 1);
+    }
+
+  cleanup_fast:
+
+  tr->thoroughInsertion = 1;
+  impr = 1;
+
+  recallBestTree(bestT, 1, tr, pr);
+  if (estimateModel)
+    {
+      pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+      pllOptimizeModelParameters(tr, pr, 1.0);
+    }
+  else
+    pllOptimizeBranchLengths(tr, pr, 32);
+
+  while (1)
+    {
+      recallBestTree(bestT, 1, tr, pr);
+      if (impr)
+        {
+          rearrangementsMin = 1;
+          rearrangementsMax = tr->stepwidth;
+
+          if (tr->searchConvergenceCriterion)
+            {
+              int bCounter = 0;
+
+              if (thoroughIterations > 1)
+                cleanupHashTable(h, (thoroughIterations % 2));
+
+              bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back,
+                  tr->mxtips, vLength, h, thoroughIterations % 2,
+                  PLL_BIPARTITIONS_RF, (branchInfo *) NULL, &bCounter, 1,
+                  PLL_FALSE, PLL_FALSE, 0);
+
+              assert(bCounter == tr->mxtips - 3);
+
+              if (thoroughIterations > 0)
+                {
+                  double rrf = convergenceCriterion(h, tr->mxtips);
+
+                  if (rrf <= 0.01) /* 1% cutoff */
+                    {
+                      goto cleanup;
+                    }
+                }
+            }
+
+          thoroughIterations++;
+        }
+      else
+        {
+          rearrangementsMax += tr->stepwidth;
+          rearrangementsMin += tr->stepwidth;
+          if (rearrangementsMax > tr->max_rearrange)
+            goto cleanup;
+        }
+      pllOptimizeBranchLengths(tr, pr, 32);
+
+      previousLh = lh = tr->likelihood;
+      saveBestTree(bestT, tr,
+          pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+
+      treeOptimizeRapid(tr, pr, rearrangementsMin, rearrangementsMax, bt,
+          &iList);
+
+      impr = 0;
+
+      for (i = 1; i <= bt->nvalid; i++)
+        {
+          recallBestTree(bt, i, tr, pr);
+
+          pllOptimizeBranchLengths(tr, pr, 8);
+
+          difference = (
+              (tr->likelihood > previousLh) ?
+                  tr->likelihood - previousLh : previousLh - tr->likelihood);
+          if (tr->likelihood > lh && difference > epsilon)
+            {
+              impr = 1;
+              lh = tr->likelihood;
+              saveBestTree(bestT, tr,
+                  pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+            }
+        }
+
+    }
+
+  cleanup:
+  if (tr->searchConvergenceCriterion)
+    {
+      freeBitVectors(bitVectors, 2 * tr->mxtips);
+      rax_free(bitVectors);
+      //freeHashTable(h);
+      //rax_free(h);
+      pllHashDestroy(&h, hash_dealloc_bipentry);
+    }
+
+  freeBestTree(bestT);
+  rax_free(bestT);
+  freeBestTree(bt);
+  rax_free(bt);
+
+  freeInfoList(&iList);
+
+  if (estimateModel) {
+      pllOptimizeModelParameters(tr, pr, epsilon);
+  }
+  pllOptimizeBranchLengths(tr, pr, 64);
+
+  return 0;
+}
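+
+/* Illustrative call sequence, not part of the upstream sources: with a tree
+ * instance and partition list that were set up elsewhere, a complete ML search
+ * with model optimization could be triggered as sketched below.
+ *
+ *   tr->searchConvergenceCriterion = PLL_TRUE;    // optional, see note above
+ *   pllRaxmlSearchAlgorithm (tr, pr, PLL_TRUE);   // PLL_TRUE: also optimize the model
+ *   printf ("Final log-likelihood: %f\n", tr->likelihood);
+ */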
+
diff --git a/pll/semaphore.h b/pll/semaphore.h
new file mode 100644
index 0000000..c6e9407
--- /dev/null
+++ b/pll/semaphore.h
@@ -0,0 +1,169 @@
+/*
+ * Module: semaphore.h
+ *
+ * Purpose:
+ *	Semaphores aren't actually part of the PThreads standard.
+ *	They are defined by the POSIX Standard:
+ *
+ *		POSIX 1003.1b-1993	(POSIX.1b)
+ *
+ * --------------------------------------------------------------------------
+ *
+ *      Pthreads-win32 - POSIX Threads Library for Win32
+ *      Copyright(C) 1998 John E. Bossom
+ *      Copyright(C) 1999,2005 Pthreads-win32 contributors
+ * 
+ *      Contact Email: rpj at callisto.canberra.edu.au
+ * 
+ *      The current list of contributors is contained
+ *      in the file CONTRIBUTORS included with the source
+ *      code distribution. The list can also be seen at the
+ *      following World Wide Web location:
+ *      http://sources.redhat.com/pthreads-win32/contributors.html
+ * 
+ *      This library is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU Lesser General Public
+ *      License as published by the Free Software Foundation; either
+ *      version 2 of the License, or (at your option) any later version.
+ * 
+ *      This library is distributed in the hope that it will be useful,
+ *      but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *      Lesser General Public License for more details.
+ * 
+ *      You should have received a copy of the GNU Lesser General Public
+ *      License along with this library in the file COPYING.LIB;
+ *      if not, write to the Free Software Foundation, Inc.,
+ *      59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+#if !defined( SEMAPHORE_H )
+#define SEMAPHORE_H
+
+#undef PTW32_SEMAPHORE_LEVEL
+
+#if defined(_POSIX_SOURCE)
+#define PTW32_SEMAPHORE_LEVEL 0
+/* Early POSIX */
+#endif
+
+#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309
+#undef PTW32_SEMAPHORE_LEVEL
+#define PTW32_SEMAPHORE_LEVEL 1
+/* Include 1b, 1c and 1d */
+#endif
+
+#if defined(INCLUDE_NP)
+#undef PTW32_SEMAPHORE_LEVEL
+#define PTW32_SEMAPHORE_LEVEL 2
+/* Include Non-Portable extensions */
+#endif
+
+#define PTW32_SEMAPHORE_LEVEL_MAX 3
+
+#if !defined(PTW32_SEMAPHORE_LEVEL)
+#define PTW32_SEMAPHORE_LEVEL PTW32_SEMAPHORE_LEVEL_MAX
+/* Include everything */
+#endif
+
+#if defined(__GNUC__) && ! defined (__declspec)
+# error Please upgrade your GNU compiler to one that supports __declspec.
+#endif
+
+/*
+ * When building the library, you should define PTW32_BUILD so that
+ * the variables/functions are exported correctly. When using the library,
+ * do NOT define PTW32_BUILD, and then the variables/functions will
+ * be imported correctly.
+ */
+#if !defined(PTW32_STATIC_LIB)
+#  if defined(PTW32_BUILD)
+#    define PTW32_DLLPORT __declspec (dllexport)
+#  else
+#    define PTW32_DLLPORT __declspec (dllimport)
+#  endif
+#else
+#  define PTW32_DLLPORT
+#endif
+
+/*
+ * This is a duplicate of what is in the autoconf config.h,
+ * which is only used when building the pthread-win32 libraries.
+ */
+
+#if !defined(PTW32_CONFIG_H)
+#  if defined(WINCE)
+#    define NEED_ERRNO
+#    define NEED_SEM
+#  endif
+#  if defined(__MINGW64__)
+#    define HAVE_STRUCT_TIMESPEC
+#    define HAVE_MODE_T
+#  elif defined(_UWIN) || defined(__MINGW32__)
+#    define HAVE_MODE_T
+#  endif
+#endif
+
+/*
+ *
+ */
+
+#if PTW32_SEMAPHORE_LEVEL >= PTW32_SEMAPHORE_LEVEL_MAX
+#if defined(NEED_ERRNO)
+#include "need_errno.h"
+#else
+#include <errno.h>
+#endif
+#endif /* PTW32_SEMAPHORE_LEVEL >= PTW32_SEMAPHORE_LEVEL_MAX */
+
+#define _POSIX_SEMAPHORES
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif				/* __cplusplus */
+
+#if !defined(HAVE_MODE_T)
+typedef unsigned int mode_t;
+#endif
+
+
+typedef struct sem_t_ * sem_t;
+
+PTW32_DLLPORT int __cdecl sem_init (sem_t * sem,
+			    int pshared,
+			    unsigned int value);
+
+PTW32_DLLPORT int __cdecl sem_destroy (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_trywait (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_wait (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_timedwait (sem_t * sem,
+				 const struct timespec * abstime);
+
+PTW32_DLLPORT int __cdecl sem_post (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_post_multiple (sem_t * sem,
+				     int count);
+
+PTW32_DLLPORT int __cdecl sem_open (const char * name,
+			    int oflag,
+			    mode_t mode,
+			    unsigned int value);
+
+PTW32_DLLPORT int __cdecl sem_close (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_unlink (const char * name);
+
+PTW32_DLLPORT int __cdecl sem_getvalue (sem_t * sem,
+				int * sval);
+
+#if defined(__cplusplus)
+}				/* End of extern "C" */
+#endif				/* __cplusplus */
+
+#undef PTW32_SEMAPHORE_LEVEL
+#undef PTW32_SEMAPHORE_LEVEL_MAX
+
+#endif				/* !SEMAPHORE_H */
diff --git a/pll/ssort.c b/pll/ssort.c
new file mode 100644
index 0000000..b08cbe7
--- /dev/null
+++ b/pll/ssort.c
@@ -0,0 +1,121 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file ssort.c
+ * Detailed description to appear soon.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "mem_alloc.h"
+
+/*  string sorting implementation from:
+ *  Bentley J. L., Sedgewick R.: Fast Algorithms for Sorting and Searching 
+ *  Strings. In Proceedings of ACM-SIAM Symposium on Discrete Algorithms 
+ *  (SODA) 1997.
+ */
+
+static void 
+vecswap (int i, int j, int n, char ** x, int * oi)
+{
+  while (n-- > 0)
+   {
+     PLL_SWAP_PTR (x[i], x[j]);
+     PLL_SWAP_INT (oi[i], oi[j]);
+
+     ++ i; ++ j;
+   }
+}
+
+static void 
+ssort1 (char ** x, int n, int depth, int * oi)
+{
+  int           a, b, c, d, r, v;
+
+  if (n <= 1) return;
+
+  a = rand() % n;
+
+  PLL_SWAP_PTR (x[0], x[a]);
+  PLL_SWAP_INT (oi[0], oi[a]);
+
+  v = x[0][depth];
+
+  a = b = 1;
+  c = d = n - 1;
+
+  for (;;)
+   {
+     while (b <= c && (r = x[b][depth] - v) <= 0)
+      {
+        if (r == 0)
+         {
+           PLL_SWAP_PTR (x[a], x[b]);
+           PLL_SWAP_INT (oi[a], oi[b]);
+           ++ a;
+         }
+        ++ b;
+      }
+     while (b <= c && (r = x[c][depth] - v) >= 0)
+      {
+        if (r == 0)
+         {
+           PLL_SWAP_PTR (x[c], x[d]);
+           PLL_SWAP_INT (oi[c], oi[d]);
+           -- d;
+         }
+        -- c;
+      }
+     if (b > c) break;
+     PLL_SWAP_PTR (x[b], x[c]);
+     PLL_SWAP_INT (oi[b], oi[c]);
+     ++ b; -- c;
+   }
+  r = PLL_MIN (a,     b - a);      vecswap (0, b - r, r, x, oi);
+  r = PLL_MIN (d - c, n - d - 1);  vecswap (b, n - r, r, x, oi);
+  r = b - a; ssort1 (x, r, depth, oi);
+  if (x[r][depth] != 0)
+   {
+     ssort1 (x + r, a + n - d - 1, depth + 1, oi + r);
+   }
+  r = d - c; ssort1 (x + n - r, r, depth, oi + n - r);
+}
+
+int * 
+pllssort1main (char ** x, int n)
+{
+  int * oi;
+  int i;
+
+  oi = (int *) rax_malloc (n * sizeof (int));
+  for (i = 0; i < n; ++ i)
+   {
+     oi[i] = i;
+   }
+  ssort1 (x, n, 0, oi);
+  
+  return (oi);
+}
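+
+/* Usage sketch (illustrative only, not part of the upstream sources): sort an
+ * array of strings in place and keep track of the original positions via the
+ * returned permutation, which must be released with rax_free().
+ *
+ *   char * words[4] = { "beta", "alpha", "delta", "gamma" };
+ *   int  * order    = pllssort1main (words, 4);
+ *   int    i;
+ *
+ *   for (i = 0; i < 4; ++ i)
+ *     printf ("%s (was at index %d)\n", words[i], order[i]);   // sorted order
+ *   rax_free (order);
+ */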
+
diff --git a/pll/stack.c b/pll/stack.c
new file mode 100644
index 0000000..062cf2e
--- /dev/null
+++ b/pll/stack.c
@@ -0,0 +1,85 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file stack.c
+ * @brief Generic stack implementation
+ *
+ * Detailed description to appear soon.
+ */
+#include <stdio.h>
+#include "stack.h"
+#include "mem_alloc.h"
+
+int pllStackSize (pllStack ** stack)
+{
+  pllStack * top;
+  int size = 0;
+  top = *stack;
+ 
+  while (top)
+  {
+    ++ size;
+    top = top->next;
+  }
+  
+  return (size);
+}
+
+int 
+pllStackPush (pllStack ** head, void * item)
+{
+  pllStack * new;
+ 
+  new = (pllStack *) rax_malloc (sizeof (pllStack));
+  if (!new) return (0);
+ 
+  new->item = item;
+  new->next = *head;
+  *head     = new;
+ 
+  return (1);
+}
+
+void * pllStackPop (pllStack ** head)
+{
+  void * item;
+  pllStack * tmp;
+  if (!*head) return (NULL);
+ 
+  tmp     = (*head);
+  item    = (*head)->item;
+  (*head) = (*head)->next;
+  rax_free (tmp);
+ 
+  return (item);
+}
+ 
+void 
+pllStackClear (pllStack ** stack)
+{
+  while (*stack) pllStackPop (stack);
+}
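+
+/* Usage sketch (illustrative only, not part of the upstream sources): push two
+ * items on a stack and pop them back in LIFO order.
+ *
+ *   pllStack * s = NULL;               // an empty stack is just a NULL head
+ *   int a = 1, b = 2;
+ *
+ *   pllStackPush (&s, (void *) &a);
+ *   pllStackPush (&s, (void *) &b);
+ *
+ *   printf ("%d\n", pllStackSize (&s));          // prints 2
+ *   printf ("%d\n", *(int *) pllStackPop (&s));  // prints 2 (the value of b)
+ *
+ *   pllStackClear (&s);                // discards the remaining entry
+ */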
+
diff --git a/pll/stack.h b/pll/stack.h
new file mode 100644
index 0000000..2ec64bd
--- /dev/null
+++ b/pll/stack.h
@@ -0,0 +1,48 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file stack.h
+ * @brief Generic stack implementation
+ *
+ * Detailed description to appear soon.
+ */
+#ifndef __pll_STACK__
+#define __pll_STACK__
+
+struct pllStack
+{
+  void * item;
+  struct pllStack * next;
+};
+
+typedef struct pllStack pllStack;
+
+void  pllStackClear (pllStack ** stack);
+void * pllStackPop (pllStack ** head);
+int pllStackPush (pllStack ** head, void * item);
+int pllStackSize (pllStack ** stack);
+
+#endif
diff --git a/pll/topologies.c b/pll/topologies.c
new file mode 100644
index 0000000..f19bf3d
--- /dev/null
+++ b/pll/topologies.c
@@ -0,0 +1,778 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file topologies.c
+ * @brief Miscellaneous functions for working with the tree topology
+*/
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h> 
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+static void saveTopolRELLRec(pllInstance *tr, nodeptr p, topolRELL *tpl, int *i, int numsp)
+{
+  int k;
+  if(isTip(p->number, numsp))
+    return;
+  else
+    {
+      nodeptr q = p->next;      
+      while(q != p)
+	{	  
+	  tpl->connect[*i].p = q;
+	  tpl->connect[*i].q = q->back; 
+	  
+	  if(tr->grouped ||  tr->constrained)
+	    {
+	      tpl->connect[*i].cp = tr->constraintVector[q->number];
+	      tpl->connect[*i].cq = tr->constraintVector[q->back->number]; 
+	    }
+	  
+	  for(k = 0; k < PLL_NUM_BRANCHES; k++)
+	    tpl->connect[*i].z[k] = q->z[k];
+	  *i = *i + 1;
+
+	  saveTopolRELLRec(tr, q->back, tpl, i, numsp);
+	  q = q->next;
+	}
+    }
+}
+
+static void saveTopolRELL(pllInstance *tr, topolRELL *tpl)
+{
+  nodeptr p = tr->start;
+  int k, i = 0;
+      
+  tpl->likelihood = tr->likelihood;
+  tpl->start      = 1;
+      
+  tpl->connect[i].p = p;
+  tpl->connect[i].q = p->back;
+  
+  if(tr->grouped ||  tr->constrained)
+    {
+      tpl->connect[i].cp = tr->constraintVector[p->number];
+      tpl->connect[i].cq = tr->constraintVector[p->back->number]; 
+    }
+
+  for(k = 0; k < PLL_NUM_BRANCHES; k++)
+    tpl->connect[i].z[k] = p->z[k];
+  i++;
+      
+  saveTopolRELLRec(tr, p->back, tpl, &i, tr->mxtips);
+
+  assert(i == 2 * tr->mxtips - 3);
+}
+
+
+static void restoreTopolRELL(pllInstance *tr, topolRELL *tpl, int numBranches)
+{
+  int i;
+  
+  for (i = 0; i < 2 * tr->mxtips - 3; i++) 
+    {
+      hookup(tpl->connect[i].p, tpl->connect[i].q, tpl->connect[i].z,  numBranches);
+      tr->constraintVector[tpl->connect[i].p->number] = tpl->connect[i].cp;
+      tr->constraintVector[tpl->connect[i].q->number] = tpl->connect[i].cq;
+    }
+  
+
+  tr->likelihood = tpl->likelihood;
+  tr->start      = tr->nodep[tpl->start];
+  /* TODO */
+}
+
+
+
+/** @brief Initialize a list of RELL topologies sized for the current tree
+  *
+  * Allocates \a n topolRELL slots, each large enough to hold a topology of the
+  * tree in \a tr, and marks them as unused by setting their likelihood to
+  * \b PLL_UNLIKELY.
+  *
+  * @param rl
+  *   RELL topology list to initialize
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param n
+  *   Number of topology slots to allocate
+  *
+  * @todo
+  *   Don't know what this is used for. Something with RELL?
+  *
+  */
+void initTL(topolRELL_LIST *rl, pllInstance *tr, int n)
+{
+  int i;
+
+  rl->max = n; 
+  rl->t = (topolRELL **)rax_malloc(sizeof(topolRELL *) * n);
+
+  for(i = 0; i < n; i++)
+    {
+      rl->t[i] = (topolRELL *)rax_malloc(sizeof(topolRELL));
+      rl->t[i]->connect = (connectRELL *)rax_malloc((2 * tr->mxtips - 3) * sizeof(connectRELL));
+      rl->t[i]->likelihood = PLL_UNLIKELY;     
+    }
+}
+
+/** @brief Deallocate the space associated with a RELL topology list
+  *
+  * @param rl
+  *   The topolRELL_LIST structure to deallocate
+  *
+  * @todo
+  *   fill the description
+  */
+void freeTL(topolRELL_LIST *rl)
+{
+  int i;
+  for(i = 0; i < rl->max; i++)    
+    {
+      rax_free(rl->t[i]->connect);          
+      rax_free(rl->t[i]);
+    }
+  rax_free(rl->t);
+}
+
+
+void restoreTL(topolRELL_LIST *rl, pllInstance *tr, int n, int numBranches)
+{
+  assert(n >= 0 && n < rl->max);    
+
+  restoreTopolRELL(tr, rl->t[n], numBranches);
+}
+
+
+
+/** @brief Reset this structure
+  *
+  * Reset the likelihoods in this structure
+  *
+  * @param rl
+  *   This structure
+  *
+  * @todo
+  *   Complete this
+  */
+void resetTL(topolRELL_LIST *rl)
+{
+  int i;
+
+  for(i = 0; i < rl->max; i++)    
+    rl->t[i]->likelihood = PLL_UNLIKELY;          
+}
+
+
+
+/** @brief Save the current topology in a RELL list slot
+  *
+  * Saves the current tree topology at position \a index of list \a rl if its
+  * likelihood is better than the one already stored there.
+  *
+  * @todo 
+  *  Complete this
+  */
+void saveTL(topolRELL_LIST *rl, pllInstance *tr, int index)
+{ 
+  assert(index >= 0 && index < rl->max);    
+    
+  if(tr->likelihood > rl->t[index]->likelihood)        
+    saveTopolRELL(tr, rl->t[index]); 
+}
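+
+/* Usage sketch (illustrative only, not part of the upstream sources): keep the
+ * best-scoring topology seen so far in a one-slot RELL list and restore it
+ * later; numBranches follows the convention used elsewhere in this file.
+ *
+ *   topolRELL_LIST rl;
+ *   int nb = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+ *
+ *   initTL (&rl, tr, 1);       // one slot, sized for the current instance
+ *   saveTL (&rl, tr, 0);       // keeps the tree if it beats the stored score
+ *   // ... rearrange the topology ...
+ *   restoreTL (&rl, tr, 0, nb);
+ *   freeTL (&rl);
+ */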
+
+
+static void  *tipValPtr (nodeptr p)
+{ 
+  return  (void *) & p->number;
+}
+
+
+static int  cmpTipVal (void *v1, void *v2)
+{
+  int  i1, i2;
+  
+  i1 = *((int *) v1);
+  i2 = *((int *) v2);
+  return  (i1 < i2) ? -1 : ((i1 == i2) ? 0 : 1);
+}
+
+
+/*  These are the only routines that need to UNDERSTAND topologies */
+
+/** @brief Allocate and initialize space for a tree topology
+    
+    Allocate and initialize a \a topol structure for a tree topology of
+    \a maxtips tips
+
+    @param maxtips
+      Number of tips of the topology
+
+    @return
+      Pointer to the allocated \a topol structure
+*/
+topol  *setupTopol (int maxtips)
+{
+  topol   *tpl;
+
+  if (! (tpl = (topol *) rax_malloc(sizeof(topol))) || 
+      ! (tpl->links = (connptr) rax_malloc((2*maxtips-3) * sizeof(pllConnect))))
+    {
+      printf("ERROR: Unable to get topology memory");
+      tpl = (topol *) NULL;
+    }
+  else 
+    {
+      tpl->likelihood  = PLL_UNLIKELY;
+      tpl->start       = (node *) NULL;
+      tpl->nextlink    = 0;
+      tpl->ntips       = 0;
+      tpl->nextnode    = 0;    
+      tpl->scrNum      = 0;     /* position in sorted list of scores */
+      tpl->tplNum      = 0;     /* position in sorted list of trees */	      
+    }
+  
+  return  tpl;
+} 
+
+
+/** @brief Deallocate the space occupied by a \a topol structure
+    
+    Deallocate the space occupied by a \a topol structure
+
+    @param tpl
+      The \a topol structure that is to be deallocated
+*/
+void freeTopol (topol *tpl)
+{
+  rax_free(tpl->links);
+  rax_free(tpl);
+} 
+
+
+static int saveSubtree (nodeptr p, topol *tpl, int numsp, int numBranches)  
+{
+  connptr  r, r0;
+  nodeptr  q, s;
+  int      t, t0, t1, k;
+
+  r0 = tpl->links;
+  r = r0 + (tpl->nextlink)++;
+  r->p = p;
+  r->q = q = p->back;
+
+  for(k = 0; k < numBranches; k++)
+    r->z[k] = p->z[k];
+
+  r->descend = 0;                     /* No children (yet) */
+
+  if (isTip(q->number, numsp)) 
+    {
+      r->valptr = tipValPtr(q);         /* Assign value */
+    }
+  else 
+    {                              /* Internal node, look at children */
+      s = q->next;                      /* First child */
+      do 
+	{
+	  t = saveSubtree(s, tpl, numsp, numBranches);        /* Generate child's subtree */
+
+	  t0 = 0;                         /* Merge child into list */
+	  t1 = r->descend;
+	  while (t1 && (cmpTipVal(r0[t1].valptr, r0[t].valptr) < 0)) {
+	    t0 = t1;
+	    t1 = r0[t1].sibling;
+          }
+	  if (t0) r0[t0].sibling = t;  else  r->descend = t;
+	  r0[t].sibling = t1;
+
+	  s = s->next;                    /* Next child */
+        } while (s != q);
+
+      r->valptr = r0[r->descend].valptr;   /* Inherit first child's value */
+      }                                 /* End of internal node processing */
+
+  return  (r - r0);
+}
+
+/** @brief Get the node with the smallest tip value
+    
+    Recursively finds and returns the tip with the smallest value around a node
+    \a p0, or returns \a p0 if it is a tip.
+
+    @param p0
+      Node at which the recursion starts
+
+    @param numsp
+      Number of species (tips) in the tree
+
+    @todo
+      Why do we return p0 immediately if it is a tip? Perhaps one of the two other nodes,
+      i.e. p0->next and p0->next->next, is a tip as well with a smaller number than p0.
+*/
+static nodeptr minSubtreeTip (nodeptr  p0, int numsp)
+{ 
+  nodeptr  minTip, p, testTip;
+
+  if (isTip(p0->number, numsp)) 
+    return p0;
+
+  p = p0->next;
+
+  minTip = minSubtreeTip(p->back, numsp);
+
+  while ((p = p->next) != p0) 
+    {
+      testTip = minSubtreeTip(p->back, numsp);
+      if (cmpTipVal(tipValPtr(testTip), tipValPtr(minTip)) < 0)
+        minTip = testTip;
+    }
+  return minTip;
+} 
+
+
+/** @brief Get the tip with the smallest value in the tree
+
+    Returns the tip with the smallest value, considering both the subtree of
+    \a p and the subtree of \a p->back.
+*/
+static nodeptr  minTreeTip (nodeptr  p, int numsp)
+{
+  nodeptr  minp, minpb;
+
+  minp  = minSubtreeTip(p, numsp);
+  minpb = minSubtreeTip(p->back, numsp);
+  return (cmpTipVal(tipValPtr(minp), tipValPtr(minpb)) < 0 ? minp : minpb);
+}
+
+/** @brief Save the tree topology in a \a topol structure
+    
+    Save the current tree topology in \a topol structure \a tpl.
+
+*/
+void saveTree (pllInstance *tr, topol *tpl, int numBranches)
+/*  Save a tree topology in a standard order so that first branches
+ *  from a node contain lower value tips than do second branches from
+ *  the node.  The root tip should have the lowest value of all.
+ */
+{
+  connptr  r;  
+  
+  tpl->nextlink = 0;                             /* Reset link pointer */
+  r = tpl->links + saveSubtree(minTreeTip(tr->start, tr->mxtips), tpl, tr->mxtips, numBranches);  /* Save tree */
+  r->sibling = 0;
+  
+  tpl->likelihood = tr->likelihood;
+  tpl->start      = tr->start;
+  tpl->ntips      = tr->ntips;
+  tpl->nextnode   = tr->nextnode;    
+  
+} /* saveTree */
+
+
+/** @brief Transform tree to a given topology and evaluate likelihood
+
+   Transforms the current tree topology to the one stored in \a tpl and
+   evaluates the likelihood
+
+   @param tpl
+     The \a topol structure holding the target topology
+
+   @param tr
+     PLL instance
+
+   @param pr
+     List of partitions
+
+   @return
+     \b PLL_TRUE
+
+   @todo
+     Remove the return value, unnecessary
+
+*/
+pllBoolean restoreTree (topol *tpl, pllInstance *tr, partitionList *pr)
+{ 
+  connptr  r;
+  nodeptr  p, p0;    
+  int  i;
+
+  /* first of all set all backs to NULL so that tips do not point anywhere */
+  for (i = 1; i <= 2*(tr->mxtips) - 2; i++) 
+    {  
+      /* Uses p = p->next at tip */
+      p0 = p = tr->nodep[i];
+      do 
+	{
+	  p->back = (nodeptr) NULL;
+	  p = p->next;
+	} 
+      while (p != p0);
+    }
+
+  /*  Copy connections from topology */
+
+  /* then connect the nodes together */
+  for (r = tpl->links, i = 0; i < tpl->nextlink; r++, i++)     
+    hookup(r->p, r->q, r->z, pr->perGeneBranchLengths?pr->numberOfPartitions:1);
+
+  tr->likelihood = tpl->likelihood;
+  tr->start      = tpl->start;
+  tr->ntips      = tpl->ntips;
+  
+  tr->nextnode   = tpl->nextnode;    
+
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+  return PLL_TRUE;
+}
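+
+/* Usage sketch (illustrative only, not part of the upstream sources): take a
+ * snapshot of the current topology in a topol structure and restore it after
+ * some experimental rearrangements.
+ *
+ *   int     nb  = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+ *   topol * tpl = setupTopol (tr->mxtips);
+ *
+ *   saveTree (tr, tpl, nb);    // snapshot of topology and branch lengths
+ *   // ... try some rearrangements ...
+ *   restoreTree (tpl, tr, pr); // back to the snapshot, likelihood re-evaluated
+ *   freeTopol (tpl);
+ */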
+
+
+
+/** @brief Initialize a list of best trees
+    
+    Initialize a list that will contain the best \a newkeep tree topologies,
+    i.e. the ones that yield the best likelihood. Inside the list initialize
+    space for \a newkeep + 1 topologies of \a numsp tips. The additional
+    topology is the starting one
+
+    @param bt
+      Pointer to \a bestlist to be initialized
+
+    @param newkeep
+      Number of new topologies to keep
+
+    @param numsp
+      Number of species (tips)
+
+    @return
+      number of tree topology slots in the list (minus the starting one)
+
+    @todo
+      Is there a reason that this function is so complex? Many of the checks
+      are unnecessary as the function is called only at two places in the
+      code with newkeep=1 and newkeep=20
+*/
+int initBestTree (bestlist *bt, int newkeep, int numsp)
+{ /* initBestTree */
+  int  i;
+
+  bt->nkeep = 0;
+
+  if (bt->ninit <= 0) 
+    {
+      if (! (bt->start = setupTopol(numsp)))  return  0;
+      bt->ninit    = -1;
+      bt->nvalid   = 0;
+      bt->numtrees = 0;
+      bt->best     = PLL_UNLIKELY;
+      bt->improved = PLL_FALSE;
+      bt->byScore  = (topol **) rax_malloc((newkeep + 1) * sizeof(topol *));
+      bt->byTopol  = (topol **) rax_malloc((newkeep + 1) * sizeof(topol *));
+      if (! bt->byScore || ! bt->byTopol) {
+        printf( "initBestTree: malloc failure\n");
+        return 0;
+      }
+    }
+  else if (PLL_ABS(newkeep) > bt->ninit) {
+    if (newkeep <  0) newkeep = -(bt->ninit);
+    else newkeep = bt->ninit;
+  }
+
+  if (newkeep < 1) {    /*  Use negative newkeep to clear list  */
+    newkeep = -newkeep;
+    if (newkeep < 1) newkeep = 1;
+    bt->nvalid = 0;
+    bt->best = PLL_UNLIKELY;
+  }
+  
+  if (bt->nvalid >= newkeep) {
+    bt->nvalid = newkeep;
+    bt->worst = bt->byScore[newkeep]->likelihood;
+  }
+  else 
+    {
+      bt->worst = PLL_UNLIKELY;
+    }
+  
+  for (i = bt->ninit + 1; i <= newkeep; i++) 
+    {    
+      if (! (bt->byScore[i] = setupTopol(numsp)))  break;
+      bt->byTopol[i] = bt->byScore[i];
+      bt->ninit = i;
+    }
+  
+  return  (bt->nkeep = PLL_MIN(newkeep, bt->ninit));
+} /* initBestTree */
+
+
+
+void resetBestTree (bestlist *bt)
+{ /* resetBestTree */
+  bt->best     = PLL_UNLIKELY;
+  bt->worst    = PLL_UNLIKELY;
+  bt->nvalid   = 0;
+  bt->improved = PLL_FALSE;
+} /* resetBestTree */
+
+
+pllBoolean  freeBestTree(bestlist *bt)
+{ /* freeBestTree */
+  while (bt->ninit >= 0)  freeTopol(bt->byScore[(bt->ninit)--]);
+    
+  /* VALGRIND */
+
+  rax_free(bt->byScore);
+  rax_free(bt->byTopol);
+
+  /* VALGRIND END */
+
+  freeTopol(bt->start);
+  return PLL_TRUE;
+} /* freeBestTree */
+
+
+/*  Compare two trees, assuming that each is in standard order.  Return
+ *  -1 if first precedes second, 0 if they are identical, or +1 if first
+ *  follows second in standard order.  Lower number tips precede higher
+ *  number tips.  A tip precedes a corresponding internal node.  Internal
+ *  nodes are ranked by their lowest number tip.
+ */
+
+static int  cmpSubtopol (connptr p10, connptr p1, connptr p20, connptr p2)
+{
+  connptr  p1d, p2d;
+  int  cmp;
+  
+  if (! p1->descend && ! p2->descend)          /* Two tips */
+    return cmpTipVal(p1->valptr, p2->valptr);
+  
+  if (! p1->descend) return -1;                /* p1 = tip, p2 = node */
+  if (! p2->descend) return  1;                /* p2 = tip, p1 = node */
+  
+  p1d = p10 + p1->descend;
+  p2d = p20 + p2->descend;
+  while (1) {                                  /* Two nodes */
+    if ((cmp = cmpSubtopol(p10, p1d, p20, p2d)))  return cmp; /* Subtrees */
+    if (! p1d->sibling && ! p2d->sibling)  return  0; /* Lists done */
+    if (! p1d->sibling) return -1;             /* One done, other not */
+    if (! p2d->sibling) return  1;             /* One done, other not */
+    p1d = p10 + p1d->sibling;                  /* Neither done */
+    p2d = p20 + p2d->sibling;
+  }
+}
+
+
+
+static int  cmpTopol (void *tpl1, void *tpl2)
+{ 
+  connptr  r1, r2;
+  int      cmp;    
+  
+  r1 = ((topol *) tpl1)->links;
+  r2 = ((topol *) tpl2)->links;
+  cmp = cmpTipVal(tipValPtr(r1->p), tipValPtr(r2->p));
+  if (cmp)      	
+    return cmp;     
+  return  cmpSubtopol(r1, r1, r2, r2);
+} 
+
+
+
+static int  cmpTplScore (void *tpl1, void *tpl2)
+{ 
+  double  l1, l2;
+  
+  l1 = ((topol *) tpl1)->likelihood;
+  l2 = ((topol *) tpl2)->likelihood;
+  return  (l1 > l2) ? -1 : ((l1 == l2) ? 0 : 1);
+}
+
+
+
+/*  Find an item in a sorted list of n items.  If the item is in the list,
+ *  return its index.  If it is not in the list, return the negative of the
+ *  position into which it should be inserted.
+ */
+
+static int  findInList (void *item, void *list[], int n, int (* cmpFunc)(void *, void *))
+{
+  int  mid, hi, lo, cmp = 0;
+  
+  if (n < 1) return  -1;                    /*  No match; first index  */
+  
+  lo = 1;
+  mid = 0;
+  hi = n;
+  while (lo < hi) {
+    mid = (lo + hi) >> 1;
+    cmp = (* cmpFunc)(item, list[mid-1]);
+    if (cmp) {
+      if (cmp < 0) hi = mid;
+      else lo = mid + 1;
+    }
+    else  return  mid;                        /*  Exact match  */
+  }
+  
+  if (lo != mid) {
+    cmp = (* cmpFunc)(item, list[lo-1]);
+    if (cmp == 0) return lo;
+  }
+  if (cmp > 0) lo++;                         /*  Result of step = 0 test  */
+  return  -lo;
+} 
+
+
+
+static int  findTreeInList (bestlist *bt, pllInstance *tr, int numBranches)
+{
+  topol  *tpl;
+  
+  tpl = bt->byScore[0];
+  saveTree(tr, tpl, numBranches);
+  return  findInList((void *) tpl, (void **) (& (bt->byTopol[1])),
+		     bt->nvalid, cmpTopol);
+} 
+
+
+/** @brief Save the current tree in the \a bestlist structure
+    
+    Save the current tree topology in \a bestlist structure \a bt.
+
+    @param tr
+      The PLL instance
+    
+    @param bt
+      The \a bestlist structure
+    
+    @param numBranches
+      Number of branches
+
+    @return
+      The position (by score) at which the tree was stored; callers currently ignore it
+
+    @todo
+      What to do with the return value? Should we simplify the code?
+*/
+int  saveBestTree (bestlist *bt, pllInstance *tr, int numBranches)
+{    
+  topol  *tpl, *reuse;
+  int  tplNum, scrNum, reuseScrNum, reuseTplNum, i, oldValid, newValid;
+  
+  tplNum = findTreeInList(bt, tr, numBranches);
+  tpl = bt->byScore[0];
+  oldValid = newValid = bt->nvalid;
+  
+  if (tplNum > 0) {                      /* Topology is in list  */
+    reuse = bt->byTopol[tplNum];         /* Matching topol  */
+    reuseScrNum = reuse->scrNum;
+    reuseTplNum = reuse->tplNum;
+  }
+  /* Good enough to keep? */
+  else if (tr->likelihood < bt->worst)  return 0;
+  
+  else {                                 /* Topology is not in list */
+    tplNum = -tplNum;                    /* Add to list (not replace) */
+    if (newValid < bt->nkeep) bt->nvalid = ++newValid;
+    reuseScrNum = newValid;              /* Take worst tree */
+    reuse = bt->byScore[reuseScrNum];
+    reuseTplNum = (newValid > oldValid) ? newValid : reuse->tplNum;
+    if (tr->likelihood > bt->start->likelihood) bt->improved = PLL_TRUE;
+  }
+  
+  scrNum = findInList((void *) tpl, (void **) (& (bt->byScore[1])),
+		      oldValid, cmpTplScore);
+  scrNum = PLL_ABS(scrNum);
+  
+  if (scrNum < reuseScrNum)
+    for (i = reuseScrNum; i > scrNum; i--)
+      (bt->byScore[i] = bt->byScore[i-1])->scrNum = i;
+  
+  else if (scrNum > reuseScrNum) {
+    scrNum--;
+    for (i = reuseScrNum; i < scrNum; i++)
+      (bt->byScore[i] = bt->byScore[i+1])->scrNum = i;
+  }
+  
+  if (tplNum < reuseTplNum)
+    for (i = reuseTplNum; i > tplNum; i--)
+      (bt->byTopol[i] = bt->byTopol[i-1])->tplNum = i;
+  
+  else if (tplNum > reuseTplNum) {
+    tplNum--;
+    for (i = reuseTplNum; i < tplNum; i++)
+      (bt->byTopol[i] = bt->byTopol[i+1])->tplNum = i;
+  }
+  
+  
+  
+  tpl->scrNum = scrNum;
+  tpl->tplNum = tplNum;
+  bt->byTopol[tplNum] = bt->byScore[scrNum] = tpl;
+  bt->byScore[0] = reuse;
+  
+  if (scrNum == 1)  bt->best = tr->likelihood;
+  if (newValid == bt->nkeep) bt->worst = bt->byScore[newValid]->likelihood;
+  
+  return  scrNum;
+} 
+
+
+/** @brief Restore the best tree from \a bestlist structure
+    
+    Restore the \a rank-th best tree from the \a bestlist structure \a bt.
+
+    @param bt
+      The \a bestlist structure containing the stored best trees
+
+    @param rank
+      The rank (by score) of the tree we want to retrieve
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @return
+      Index (rank) of restored topology in \a bestlist
+*/
+int  recallBestTree (bestlist *bt, int rank, pllInstance *tr, partitionList *pr)
+{ 
+  if (rank < 1)  rank = 1;
+  if (rank > bt->nvalid)  rank = bt->nvalid;
+  if (rank > 0)  if (! restoreTree(bt->byScore[rank], tr, pr)) return PLL_FALSE;
+  return  rank;
+}
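+
+/* Usage sketch (illustrative only, not part of the upstream sources): maintain
+ * a list of the 20 best trees found during a search and recall the best one at
+ * the end, mirroring the way pllRaxmlSearchAlgorithm() uses this interface.
+ *
+ *   int nb = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+ *   bestlist * bt = (bestlist *) rax_malloc (sizeof (bestlist));
+ *
+ *   bt->ninit = 0;
+ *   initBestTree (bt, 20, tr->mxtips);
+ *
+ *   // after each candidate topology has been evaluated:
+ *   saveBestTree (bt, tr, nb);
+ *
+ *   // when the search is over, restore the highest-scoring topology (rank 1)
+ *   recallBestTree (bt, 1, tr, pr);
+ *
+ *   freeBestTree (bt);
+ *   rax_free (bt);
+ */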
+
+
+
+
diff --git a/pll/trash.c b/pll/trash.c
new file mode 100644
index 0000000..5247c25
--- /dev/null
+++ b/pll/trash.c
@@ -0,0 +1,129 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file trash.c
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>  
+#endif
+
+#include <limits.h>
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+ 
+  
+/** @brief Reorder nodes in PLL tree
+
+    Re-order the internal nodes of the tree of PLL instance \a tr in a preorder
+    traversal such that they start from \a p
+    
+    @param tr
+      PLL instance
+
+    @param np
+      Array of node pointers
+
+    @param p
+      Node from where the preorder traversal should start
+
+    @param count
+      Pointer to a counter of inner nodes already repositioned, used as an offset into \a tr->nodep
+
+    @todo
+      why not insert a break in the for loop when the node is found?
+
+*/
+static void reorderNodes(pllInstance *tr, nodeptr *np, nodeptr p, int *count)
+{
+  int i, found = 0;
+
+  if(isTip(p->number, tr->mxtips))    
+    return;
+  else
+    {              
+      for(i = tr->mxtips + 1; (i <= (tr->mxtips + tr->mxtips - 1)) && (found == 0); i++)
+	{
+	  if (p == np[i] || p == np[i]->next || p == np[i]->next->next)
+	    {
+	      if(p == np[i])			       
+		tr->nodep[*count + tr->mxtips + 1] = np[i];		 		
+	      else
+		{
+		  if(p == np[i]->next)		  
+		    tr->nodep[*count + tr->mxtips + 1] = np[i]->next;		     	   
+		  else		   
+		    tr->nodep[*count + tr->mxtips + 1] = np[i]->next->next;		    		    
+		}
+
+	      found = 1;	      	     
+	      *count = *count + 1;
+	    }
+	} 
+      
+      assert(found != 0);
+     
+      reorderNodes(tr, np, p->next->back, count);     
+      reorderNodes(tr, np, p->next->next->back, count);                
+    }
+}
+
+void nodeRectifier(pllInstance *tr)
+{
+  nodeptr *np = (nodeptr *)rax_malloc(2 * tr->mxtips * sizeof(nodeptr));
+  int i;
+  int count = 0;
+  
+  tr->start       = tr->nodep[1];
+  tr->rooted      = PLL_FALSE;
+
+  /* TODO why is tr->rooted set to PLL_FALSE here ?*/
+  
+  for(i = tr->mxtips + 1; i <= (tr->mxtips + tr->mxtips - 1); i++)
+    np[i] = tr->nodep[i];           
+  
+  reorderNodes(tr, np, tr->start->back, &count); 
+
+ 
+  rax_free(np);
+}
+
+nodeptr findAnyTip(nodeptr p, int numsp)
+{ 
+  return  isTip(p->number, numsp) ? p : findAnyTip(p->next->back, numsp);
+} 
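+
+/* A small illustrative sketch, assuming \a tr is a fully built PLL instance: starting
+   from any inner node, findAnyTip() descends towards the leaves until it reaches a
+   tip, which is a convenient, well-defined starting point for tree traversals
+   (tr->start is commonly expected to point to a tip, cf. nodeRectifier above). */
+static void exampleStartFromTip(pllInstance *tr)
+{
+  nodeptr inner = tr->nodep[tr->mxtips + 1];       /* first inner node */
+  nodeptr tip   = findAnyTip(inner, tr->mxtips);   /* some tip below it */
+
+  tr->start = tip;
+}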
+
diff --git a/pll/treeIO.c b/pll/treeIO.c
new file mode 100644
index 0000000..0a63b40
--- /dev/null
+++ b/pll/treeIO.c
@@ -0,0 +1,236 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file treeIO.c
+ */
+#include "mem_alloc.h"
+
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h> 
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+extern char *likelihood_key;
+extern char *ntaxa_key;
+extern char *smoothed_key;
+extern int partCount;
+
+int countTips(nodeptr p, int numsp)
+{
+  if(isTip(p->number, numsp))  
+    return 1;    
+  {
+    nodeptr q;
+    int tips = 0;
+
+    q = p->next;
+    while(q != p)
+      { 
+	tips += countTips(q->back, numsp);
+	q = q->next;
+      } 
+    
+    return tips;
+  }
+}
+
+
+static double getBranchLength(pllInstance *tr, partitionList *pr, int perGene, nodeptr p)
+{
+  double 
+    z = 0.0,
+    x = 0.0;
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  assert(perGene != PLL_NO_BRANCHES);
+	      
+  if(numBranches == 1)
+    {
+      assert(tr->fracchange != -1.0);
+      z = p->z[0];
+      if (z < PLL_ZMIN) 
+	z = PLL_ZMIN;      	 
+      
+      x = -log(z) * tr->fracchange;           
+    }
+  else
+    {
+      if(perGene == PLL_SUMMARIZE_LH)
+	{
+	  int 
+	    i;
+	  
+	  double 
+	    avgX = 0.0;
+		      
+	  for(i = 0; i < numBranches; i++)
+	    {
+	      assert(pr->partitionData[i]->partitionContribution != -1.0);
+	      assert(pr->partitionData[i]->fracchange != -1.0);
+	      z = p->z[i];
+	      if(z < PLL_ZMIN) 
+		z = PLL_ZMIN;      	 
+	      x = -log(z) * pr->partitionData[i]->fracchange;
+	      avgX += x * pr->partitionData[i]->partitionContribution;
+	    }
+
+	  x = avgX;
+	}
+      else
+	{	
+	  assert(pr->partitionData[perGene]->fracchange != -1.0);
+	  assert(perGene >= 0 && perGene < numBranches);
+	  
+	  z = p->z[perGene];
+	  
+	  if(z < PLL_ZMIN) 
+	    z = PLL_ZMIN;      	 
+	  
+	  x = -log(z) * pr->partitionData[perGene]->fracchange;
+	}
+    }
+
+  return x;
+}
+
+static char *pllTreeToNewickREC(char *treestr, pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean printBranchLengths, pllBoolean printNames,
+			    pllBoolean printLikelihood, pllBoolean rellTree, pllBoolean finalPrint, int perGene, pllBoolean branchLabelSupport, pllBoolean printSHSupport)
+{
+  char  *nameptr;            
+      
+  if(isTip(p->number, tr->mxtips)) 
+    {	       	  
+      if(printNames)
+	{
+	  nameptr = tr->nameList[p->number];     
+	  sprintf(treestr, "%s", nameptr);
+	}
+      else
+	sprintf(treestr, "%d", p->number);    
+	
+      while (*treestr) treestr++;
+    }
+  else 
+    {                 	 
+      *treestr++ = '(';
+      treestr = pllTreeToNewickREC(treestr, tr, pr, p->next->back, printBranchLengths, printNames, printLikelihood, rellTree,
+			       finalPrint, perGene, branchLabelSupport, printSHSupport);
+      *treestr++ = ',';
+      treestr = pllTreeToNewickREC(treestr, tr, pr, p->next->next->back, printBranchLengths, printNames, printLikelihood, rellTree,
+			       finalPrint, perGene, branchLabelSupport, printSHSupport);
+      if(p == tr->start->back) 
+	{
+	  *treestr++ = ',';
+	  treestr = pllTreeToNewickREC(treestr, tr, pr, p->back, printBranchLengths, printNames, printLikelihood, rellTree,
+				   finalPrint, perGene, branchLabelSupport, printSHSupport);
+	}
+      *treestr++ = ')';                    
+    }
+
+  if(p == tr->start->back) 
+    {	      	 
+      if(printBranchLengths && !rellTree)
+	sprintf(treestr, ":0.0;\n");
+      else
+	sprintf(treestr, ";\n");	 	  	
+    }
+  else 
+    {                   
+      if(rellTree || branchLabelSupport || printSHSupport)
+	{	 	 
+	  if(( !isTip(p->number, tr->mxtips)) && 
+	     ( !isTip(p->back->number, tr->mxtips)))
+	    {			      
+	      assert(p->bInf != (branchInfo *)NULL);
+	      
+	      if(rellTree)
+		sprintf(treestr, "%d:%8.20f", p->bInf->support, p->z[0]);
+	      if(branchLabelSupport)
+		sprintf(treestr, ":%8.20f[%d]", p->z[0], p->bInf->support);
+	      if(printSHSupport)
+		sprintf(treestr, ":%8.20f[%d]", getBranchLength(tr, pr, perGene, p), p->bInf->support);
+	      
+	    }
+	  else		
+	    {
+	      if(rellTree || branchLabelSupport)
+		sprintf(treestr, ":%8.20f", p->z[0]);	
+	      if(printSHSupport)
+		sprintf(treestr, ":%8.20f", getBranchLength(tr, pr, perGene, p));
+	    }
+	}
+      else
+	{
+	  if(printBranchLengths)	    
+	    sprintf(treestr, ":%8.20f", getBranchLength(tr, pr, perGene, p));
+	  else	    
+	    sprintf(treestr, "%s", "\0");	    
+	}      
+    }
+  
+  while (*treestr) treestr++;
+  return  treestr;
+}
+
+
+char *pllTreeToNewick(char *treestr, pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean printBranchLengths, pllBoolean printNames, pllBoolean printLikelihood,
+		  pllBoolean rellTree, pllBoolean finalPrint, int perGene, pllBoolean branchLabelSupport, pllBoolean printSHSupport)
+{ 
+
+  if(rellTree)
+    assert(!branchLabelSupport && !printSHSupport);
+
+  if(branchLabelSupport)
+    assert(!rellTree && !printSHSupport);
+
+  if(printSHSupport)
+    assert(!branchLabelSupport && !rellTree);
+
+ 
+  pllTreeToNewickREC(treestr, tr, pr, p, printBranchLengths, printNames, printLikelihood, rellTree,
+		 finalPrint, perGene, branchLabelSupport, printSHSupport);  
+    
+  
+  while (*treestr) treestr++;
+  
+  return treestr;
+}
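+
+/* A minimal usage sketch, assuming \a tr and \a pr are a fully initialized instance
+   and partition list and \a buf is large enough to hold the Newick string: export the
+   current topology with taxon names and branch lengths, all support annotations off. */
+static void exampleExportNewick(pllInstance *tr, partitionList *pr, char *buf)
+{
+  pllTreeToNewick(buf, tr, pr, tr->start->back,
+                  PLL_TRUE,          /* printBranchLengths */
+                  PLL_TRUE,          /* printNames         */
+                  PLL_FALSE,         /* printLikelihood    */
+                  PLL_FALSE,         /* rellTree           */
+                  PLL_FALSE,         /* finalPrint         */
+                  PLL_SUMMARIZE_LH,  /* perGene            */
+                  PLL_FALSE,         /* branchLabelSupport */
+                  PLL_FALSE);        /* printSHSupport     */
+
+  printf("%s", buf);
+}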
+
diff --git a/pll/treeIO.h b/pll/treeIO.h
new file mode 100644
index 0000000..c91a1ab
--- /dev/null
+++ b/pll/treeIO.h
@@ -0,0 +1,23 @@
+/*
+ * treeIO.h
+ *
+ *  Created on: Nov 22, 2012
+ *      Author: tung
+ */
+
+/*
+I just put some declarations of the functions that I need here.
+Please extend this file. It's important to have a header file.
+It makes integration with other software much easier.
+*/
+
+#ifndef TREEIO_H_
+#define TREEIO_H_
+
+#include "pll.h"
+
+char *pllTreeToNewick(char *treestr, pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean printBranchLengths, pllBoolean printNames, pllBoolean printLikelihood,
+		  pllBoolean rellTree, pllBoolean finalPrint, int perGene, pllBoolean branchLabelSupport, pllBoolean printSHSupport);
+
+#endif /* TREEIO_H_ */
diff --git a/pll/utils.c b/pll/utils.c
new file mode 100644
index 0000000..02c49f1
--- /dev/null
+++ b/pll/utils.c
@@ -0,0 +1,3735 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file utils.c
+ *  
+ * @brief Miscellaneous general utility and helper functions
+ */
+#ifdef WIN32
+#include <direct.h>
+#endif
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdarg.h>
+#include <limits.h>
+#include <assert.h>
+#include <errno.h>
+#include "cycle.h"
+
+
+#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
+#if (defined(__AVX) || defined(__SSE3))
+#include <xmmintrin.h>
+#endif
+/*
+   special bug fix: enforces denormalized numbers to be flushed to zero;
+   without it the program is a tiny bit faster, though.
+#include <emmintrin.h> 
+#define MM_DAZ_MASK    0x0040
+#define MM_DAZ_ON    0x0040
+#define MM_DAZ_OFF    0x0000
+*/
+#endif
+
+#include "pll.h"
+#include "pllInternal.h"
+
+#define GLOBAL_VARIABLES_DEFINITION
+
+#include "globalVariables.h"
+
+/* mappings of BIN/DNA/AA alphabet to numbers */
+
+static const char PLL_MAP_BIN[256] =
+ {
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  3, -1, -1,
+    1,  2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  3,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+  };
+
+static const char PLL_MAP_NT[256] =
+ {
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15,
+   -1,  1, 14,  2, 13, -1, -1,  4, 11, -1, -1, 12, -1,  3, 15, 15,
+   -1, -1,  5,  6,  8,  8,  7,  9, 15, 10, -1, -1, -1, -1, -1, -1,
+   -1,  1, 14,  2, 13, -1, -1,  4, 11, -1, -1, 12, -1,  3, 15, 15,
+   -1, -1,  5,  6,  8,  8,  7,  9, 15, 10, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+ };
+
+static const char PLL_MAP_AA[256] =
+ {
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, 22, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22,
+   -1,  0, 20,  4,  3,  6, 13,  7,  8,  9, -1, 11, 10, 12,  2, -1,
+   14,  5,  1, 15, 16, -1, 19, 17, 22, 18, 21, -1, -1, -1, -1, -1,
+   -1,  0, 20,  4,  3,  6, 13,  7,  8,  9, -1, 11, 10, 12,  2, -1,
+   14,  5,  1, 15, 16, -1, 19, 17, 22, 18, 21, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+ };
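+
+/* An illustrative sketch of how the DNA map above encodes characters: each base is a
+   bit in {A=1, C=2, G=4, T/U=8}, IUPAC ambiguity codes are the bitwise OR of their
+   constituent bases, and the fully undetermined states ('N', '?', 'O', 'X', '-')
+   map to 15 (all bits set). */
+static void exampleDnaMapLookup(void)
+{
+  assert(PLL_MAP_NT[(int)'A'] == 1);
+  assert(PLL_MAP_NT[(int)'C'] == 2);
+  assert(PLL_MAP_NT[(int)'G'] == 4);
+  assert(PLL_MAP_NT[(int)'T'] == 8);
+  /* 'R' means A or G */
+  assert(PLL_MAP_NT[(int)'R'] == (PLL_MAP_NT[(int)'A'] | PLL_MAP_NT[(int)'G']));
+  assert(PLL_MAP_NT[(int)'N'] == 15);
+}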
+
+
+
+
+
+static void pllTreeInitDefaults (pllInstance * tr, int tips);
+static void getInnerBranchEndPointsRecursive (nodeptr p, int tips, int * i, node **nodes);
+#if (!defined(_FINE_GRAIN_MPI) && !defined(_USE_PTHREADS))
+static void initializePartitionsSequential(pllInstance *tr, partitionList *pr);
+#endif
+
+/** @defgroup instanceLinkingGroup Linking topology, partition scheme and alignment to the PLL instance
+    
+    This set of functions handles the linking of topology, partition scheme and multiple sequence alignment
+    with the PLL instance
+*/
+/***************** UTILITY FUNCTIONS **************************/
+
+#if (!defined(_SVID_SOURCE) && !defined(_BSD_SOURCE) && !defined(_POSIX_C_SOURCE) && !defined(_XOPEN_SOURCE) && !defined(_POSIX_SOURCE))
+static char *
+my_strtok_r (char * s, const char * delim, char **save_ptr)
+{  
+  char *token;
+   
+  /* Scan leading delimiters */
+  if (s == NULL)
+    s = *save_ptr;
+   
+  s += strspn (s, delim);
+  if (*s == '\0')
+   {
+     *save_ptr = s;
+     return NULL;
+   }
+   
+  /* Find the end of the token. */
+  token = s;
+  s = strpbrk (token, delim);
+  if (!s)
+    *save_ptr = strchr (token, '\0');
+  else
+   {
+     /* Terminate the token and make *SAVE_PTR point past it */
+     *s = '\0';
+     *save_ptr = s + 1;
+   }
+   
+  return token;
+}
+#endif
+
+#if (defined(_SVID_SOURCE) || defined(_BSD_SOURCE) || defined(_POSIX_C_SOURCE) || defined(_XOPEN_SOURCE) || defined(_POSIX_SOURCE))
+#define STRTOK_R strtok_r
+#else
+#define STRTOK_R my_strtok_r
+#endif
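+
+/* A short usage sketch: STRTOK_R behaves like POSIX strtok_r(), with the re-entrant
+   parser state kept in the caller-supplied save pointer. \a line is assumed to be a
+   writable, NUL-terminated string. */
+static void exampleTokenize(char *line)
+{
+  char *saveptr = NULL;
+  char *token;
+
+  for (token = STRTOK_R(line, " \t", &saveptr);
+       token;
+       token = STRTOK_R(NULL, " \t", &saveptr))
+    printf("token: %s\n", token);
+}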
+
+
+
+
+void storeExecuteMaskInTraversalDescriptor(pllInstance *tr, partitionList *pr)
+{
+  int model;
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    tr->td[0].executeModel[model] = pr->partitionData[model]->executeModel;
+
+}
+
+void storeValuesInTraversalDescriptor(pllInstance *tr, partitionList *pr, double *value)
+{
+  int model;
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    tr->td[0].parameterValues[model] = value[model];
+}
+
+const unsigned int *getBitVector(int dataType)
+{
+  assert(PLL_MIN_MODEL < dataType && dataType < PLL_MAX_MODEL);
+
+  return pLengths[dataType].bitVector;
+}
+
+/*
+int getStates(int dataType)
+{
+  assert(PLL_MIN_MODEL < dataType && dataType < PLL_MAX_MODEL);
+
+  return pLengths[dataType].states;
+}
+*/
+
+int getUndetermined(int dataType)
+{
+  assert(PLL_MIN_MODEL < dataType && dataType < PLL_MAX_MODEL);
+
+  return pLengths[dataType].undetermined;
+}
+
+const partitionLengths *getPartitionLengths(pInfo *p)
+{
+  int 
+    dataType  = p->dataType,
+    states    = p->states,
+    tipLength = p->maxTipStates;
+
+  assert(states != -1 && tipLength != -1);
+
+  assert(PLL_MIN_MODEL < dataType && dataType < PLL_MAX_MODEL);
+
+  /*pLength.leftLength = pLength.rightLength = states * states;
+    pLength.eignLength = states;
+    pLength.evLength   = states * states;
+    pLength.eiLength   = states * states;
+    pLength.substRatesLength = (states * states - states) / 2;
+    pLength.frequenciesLength = states;
+    pLength.tipVectorLength   = tipLength * states;
+    pLength.symmetryVectorLength = (states * states - states) / 2;
+    pLength.frequencyGroupingLength = states;
+    pLength.nonGTR = PLL_FALSE;*/
+  return (&pLengths[dataType]); 
+}
+
+size_t discreteRateCategories(int rateHetModel)
+{
+  size_t 
+    result;
+
+  switch(rateHetModel)
+  {
+    case PLL_CAT:
+      result = 1;
+      break;
+    case PLL_GAMMA:
+      result = 4;
+      break;
+    default:
+      assert(0);
+  }
+
+  return result;
+}
+
+
+
+double gettime(void)
+{
+#ifdef WIN32
+  time_t tp;
+  struct tm localtm;
+  tp = time(NULL);
+  localtm = *localtime(&tp);
+  return 60.0*localtm.tm_min + localtm.tm_sec;
+#else
+  struct timeval ttime;
+  gettimeofday(&ttime , NULL);
+  return ttime.tv_sec + ttime.tv_usec * 0.000001;
+#endif
+}
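+
+/* A minimal timing sketch: gettime() returns wall-clock time in seconds as a double
+   (with a coarser, minute/second based value on WIN32, see above), so elapsed time
+   is simply the difference of two calls. */
+static void exampleTiming(void)
+{
+  double start = gettime();
+
+  /* ... some expensive computation ... */
+
+  printf("elapsed: %f seconds\n", gettime() - start);
+}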
+
+int gettimeSrand(void)
+{
+#ifdef WIN32
+  time_t tp;
+  struct tm localtm;
+  tp = time(NULL);
+  localtm = *localtime(&tp);
+  return 24*60*60*localtm.tm_yday + 60*60*localtm.tm_hour + 60*localtm.tm_min  + localtm.tm_sec;
+#else
+  struct timeval ttime;
+  gettimeofday(&ttime , NULL);
+  return ttime.tv_sec + ttime.tv_usec;
+#endif
+}
+
+double randum (long  *seed)
+{
+  long  sum, mult0, mult1, seed0, seed1, seed2, newseed0, newseed1, newseed2;
+  double res;
+
+  mult0 = 1549;
+  seed0 = *seed & 4095;
+  sum  = mult0 * seed0;
+  newseed0 = sum & 4095;
+  sum >>= 12;
+  seed1 = (*seed >> 12) & 4095;
+  mult1 =  406;
+  sum += mult0 * seed1 + mult1 * seed0;
+  newseed1 = sum & 4095;
+  sum >>= 12;
+  seed2 = (*seed >> 24) & 255;
+  sum += mult0 * seed2 + mult1 * seed1;
+  newseed2 = sum & 255;
+
+  *seed = newseed2 << 24 | newseed1 << 12 | newseed0;
+  res = 0.00390625 * (newseed2 + 0.000244140625 * (newseed1 + 0.000244140625 * newseed0));
+
+  return res;
+}
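+
+/* A small usage sketch: randum() is a deterministic pseudo-random generator that
+   updates the caller-supplied seed in place and returns a value in [0, 1), so the
+   same start seed always reproduces the same sequence. */
+static void exampleRandum(void)
+{
+  long seed = 12345;
+  int  i;
+
+  for (i = 0; i < 3; i++)
+    printf("%f\n", randum(&seed));
+}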
+
+
+/********************* END UTILITY FUNCTIONS ********************/
+
+
+/******************************some functions for the likelihood computation ****************************/
+
+
+/** @brief Check whether a node is a tip.
+    
+    Checks whether the node with number \a number is a tip.
+    
+    @param number
+     Node number to be checked
+   
+    @param maxTips
+     Number of tips in the tree
+   
+    @return
+      \b PLL_TRUE if tip, \b PLL_FALSE otherwise
+  */
+pllBoolean isTip(int number, int maxTips)
+{
+  assert(number > 0);
+
+  if(number <= maxTips)
+    return PLL_TRUE;
+  else
+    return PLL_FALSE;
+}
+
+/** @brief Set the orientation of a node
+
+    Sets the orientation of node \a p. That is, it resets \a p->next->x and
+    \a p->next->next->x to 0 and sets \a p->x to 1, meaning that the conditional
+    likelihood vector for that inner node is oriented towards \a p, i.e. the
+    conditional likelihood vector represents the subtree rooted at \a p and not
+    at either of the other two nodes.
+
+    @param p
+      Node which we want to orient
+*/
+void getxnode (nodeptr p)
+{
+  nodeptr  s;
+
+  if ((s = p->next)->x || (s = s->next)->x)
+  {
+    p->x = s->x;
+    s->x = 0;
+  }
+
+  assert(p->x);
+}
+
+
+/** @brief Connect two nodes and assign branch lengths 
+  * 
+  * Connect the two nodes \a p and \a q in each partition \e i with a branch of
+  * length \a z[i]
+  *
+  * @param p
+  *   Node \a p
+  * 
+  * @param q
+  *   Node \a q
+  *
+  * @param numBranches
+  *   Number of partitions
+  */
+void hookup (nodeptr p, nodeptr q, double *z, int numBranches)
+{
+  int i;
+
+  p->back = q;
+  q->back = p;
+
+  for(i = 0; i < numBranches; i++)
+    p->z[i] = q->z[i] = z[i];
+}
+
+/* connects node p with q and assigns the branch lengths z for the whole vector*/
+void hookupFull (nodeptr p, nodeptr q, double *z)
+{
+  //int i;
+
+  p->back = q;
+  q->back = p;
+
+  memcpy(p->z, z, PLL_NUM_BRANCHES*sizeof(double) );
+  memcpy(q->z, z, PLL_NUM_BRANCHES*sizeof(double) );
+  //for(i = 0; i < numBranches; i++)
+  //  p->z[i] = q->z[i] = z[i];
+
+}
+
+/* connect node p with q and assign the default branch lengths */
+void hookupDefault (nodeptr p, nodeptr q)
+{
+  int i;
+
+  p->back = q;
+  q->back = p;
+
+// TODO: fix: this makes parsimony tree computation very slow with increasing PLL_NUM_BRANCHES
+//  for(i = 0; i < PLL_NUM_BRANCHES; i++)
+//    p->z[i] = q->z[i] = PLL_DEFAULTZ;
+    p->z[0] = q->z[0] = PLL_DEFAULTZ;
+}
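+
+/* A minimal sketch of connecting two nodes, assuming \a numBranches is the number of
+   branch length values actually maintained (1 unless per-partition branch lengths
+   are used): all slots are set to the default value and the two back pointers are
+   linked in both directions. */
+static void exampleHookup(nodeptr p, nodeptr q, int numBranches)
+{
+  double z[PLL_NUM_BRANCHES];
+  int    i;
+
+  for (i = 0; i < numBranches; i++)
+    z[i] = PLL_DEFAULTZ;
+
+  hookup(p, q, z, numBranches);   /* afterwards p->back == q and q->back == p */
+}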
+
+
+/***********************reading and initializing input ******************/
+
+
+
+pllBoolean whitechar (int ch)
+{
+  return (ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r');
+}
+/*
+static unsigned int KISS32(void)
+{
+  static unsigned int 
+    x = 123456789, 
+      y = 362436069,
+      z = 21288629,
+      w = 14921776,
+      c = 0;
+
+  unsigned int t;
+
+  x += 545925293;
+  y ^= (y<<13); 
+  y ^= (y>>17); 
+  y ^= (y<<5);
+  t = z + w + c; 
+  z = w; 
+  c = (t>>31); 
+  w = t & 2147483647;
+
+  return (x+y+w);
+}
+*/
+
+/** @brief Get a random subtree
+
+    Returns the root node of a randomly picked subtree of the tree in PLL
+    instance \a tr. The picked subtree is guaranteed to have height greater than
+    1, that is, the two direct descendants of the returned (root) node are not
+    both tips.
+
+    @param tr
+      PLL instance
+
+    @return
+      The root node of the randomly picked subtree
+*/
+nodeptr pllGetRandomSubtree(pllInstance *tr)
+{
+  nodeptr p;
+  do
+  {
+    int exitDirection = rand() % 3; 
+    p = tr->nodep[(rand() % (tr->mxtips - 2)) + 1 + tr->mxtips];
+    switch(exitDirection)
+    {
+      case 0:
+        break;
+      case 1:
+        p = p->next;
+        break;
+      case 2:
+        p = p->next->next;
+        break;
+      default:
+        assert(0);
+    }
+  }
+  while(isTip(p->next->back->number, tr->mxtips) && isTip(p->next->next->back->number, tr->mxtips));
+  assert(!isTip(p->number, tr->mxtips));
+  return p;
+}
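+
+/* A brief usage sketch: pllGetRandomSubtree() relies on rand(), so seeding the
+   standard generator first makes the choice reproducible. The returned node is an
+   inner node whose two direct descendants are not both tips. */
+static nodeptr examplePickSubtree(pllInstance *tr, unsigned int seed)
+{
+  srand(seed);
+  return pllGetRandomSubtree(tr);
+}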
+/* small example program that executes ancestral state computations 
+   on the entire subtree rooted at p.
+
+   Note that this is a post-order traversal.
+*/
+
+  
+void computeAllAncestralVectors(nodeptr p, pllInstance *tr, partitionList *pr)
+{
+  /* only proceed if this is not a tip; for a tip it makes no sense to compute
+     the ancestral sequence because we already have the observed one
+  */
+
+  if(!isTip(p->number, tr->mxtips))
+    {
+      /* descend recursively to compute the ancestral states in the left and right subtrees */
+
+      computeAllAncestralVectors(p->next->back, tr, pr);
+      computeAllAncestralVectors(p->next->next->back, tr, pr);
+      
+      /* then compute the ancestral state at node p */
+
+      pllUpdatePartialsAncestral(tr, pr, p);
+
+      /* and print it to terminal, the two booleans that are set to PLL_TRUE here 
+         tell the function to print the marginal probabilities as well as 
+         a discrete inner sequence, that is, ACGT etc., always selecting and printing 
+         the state that has the highest probability */
+
+      printAncestralState(p, PLL_TRUE, PLL_TRUE, tr, pr);
+    }
+}
+
+
+
+void initializePartitionData(pllInstance *localTree, partitionList * localPartitions)
+{
+  /* in ancestralVectorWidth we store the total length in bytes (!) of 
+     one conditional likelihood array !
+     we need to know this length such that in the pthreads version the master thread can actually 
+     gather the scattered ancestral probabilities from the threads such that they can be printed to screen!
+  */
+
+  size_t 
+    maxCategories = (size_t)localTree->maxCategories;
+
+  size_t 
+    ancestralVectorWidth = 0,
+    model; 
+
+  int 
+    tid  = localTree->threadID,
+    innerNodes = localTree->mxtips - 2;
+
+  if(tid > 0)
+      localTree->rateCategory    = (int *)    rax_calloc((size_t)localTree->originalCrunchedLength, sizeof(int));           
+
+  for(model = 0; model < (size_t)localPartitions->numberOfPartitions; model++)
+    {
+      size_t 
+        width = localPartitions->partitionData[model]->width;
+
+      const partitionLengths 
+        *pl = getPartitionLengths(localPartitions->partitionData[model]);
+
+      /* 
+         globalScaler needs to be 2 * localTree->mxtips such that scalers of inner AND tip nodes can be added without a case switch
+         to this end, it must also be initialized with zeros -> calloc
+      */
+
+      localPartitions->partitionData[model]->globalScaler    = (unsigned int *)rax_calloc(2 *(size_t)localTree->mxtips, sizeof(unsigned int));
+
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->left),  PLL_BYTE_ALIGNMENT, (size_t)pl->leftLength * (maxCategories + 1) * sizeof(double));
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->right), PLL_BYTE_ALIGNMENT, (size_t)pl->rightLength * (maxCategories + 1) * sizeof(double));
+      localPartitions->partitionData[model]->EIGN              = (double*)rax_malloc((size_t)pl->eignLength * sizeof(double));
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->EV),    PLL_BYTE_ALIGNMENT, (size_t)pl->evLength * sizeof(double));
+      localPartitions->partitionData[model]->EI                = (double*)rax_malloc((size_t)pl->eiLength * sizeof(double));
+      localPartitions->partitionData[model]->substRates        = (double *)rax_malloc((size_t)pl->substRatesLength * sizeof(double));
+      localPartitions->partitionData[model]->frequencies       = (double*)rax_malloc((size_t)pl->frequenciesLength * sizeof(double));
+      localPartitions->partitionData[model]->freqExponents     = (double*)rax_malloc(pl->frequenciesLength * sizeof(double));
+      localPartitions->partitionData[model]->empiricalFrequencies       = (double*)rax_malloc((size_t)pl->frequenciesLength * sizeof(double));
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->tipVector), PLL_BYTE_ALIGNMENT, (size_t)pl->tipVectorLength * sizeof(double));
+      //localPartitions->partitionData[model]->partitionName      = NULL;   // very important since it is deallocated in pllPartitionsDestroy
+      
+       if(localPartitions->partitionData[model]->dataType == PLL_AA_DATA
+               && (localPartitions->partitionData[model]->protModels == PLL_LG4M || localPartitions->partitionData[model]->protModels == PLL_LG4X))
+        {
+          int 
+            k;
+          
+          for(k = 0; k < 4; k++)
+            {       
+              localPartitions->partitionData[model]->EIGN_LG4[k]              = (double*)rax_malloc(pl->eignLength * sizeof(double));
+              rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->EV_LG4[k]), PLL_BYTE_ALIGNMENT, pl->evLength * sizeof(double));
+              localPartitions->partitionData[model]->EI_LG4[k]                = (double*)rax_malloc(pl->eiLength * sizeof(double));
+              localPartitions->partitionData[model]->substRates_LG4[k]        = (double *)rax_malloc(pl->substRatesLength * sizeof(double));
+              localPartitions->partitionData[model]->frequencies_LG4[k]       = (double*)rax_malloc(pl->frequenciesLength * sizeof(double));
+              rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->tipVector_LG4[k]), PLL_BYTE_ALIGNMENT, pl->tipVectorLength * sizeof(double));
+            }
+        }
+
+      localPartitions->partitionData[model]->symmetryVector    = (int *)rax_malloc((size_t)pl->symmetryVectorLength  * sizeof(int));
+      localPartitions->partitionData[model]->frequencyGrouping = (int *)rax_malloc((size_t)pl->frequencyGroupingLength  * sizeof(int));
+
+      localPartitions->partitionData[model]->perSiteRates      = (double *)rax_malloc(sizeof(double) * maxCategories);
+
+      localPartitions->partitionData[model]->nonGTR = PLL_FALSE;
+
+      localPartitions->partitionData[model]->gammaRates = (double*)rax_malloc(sizeof(double) * 4);
+      localPartitions->partitionData[model]->yVector = (unsigned char **)rax_malloc(sizeof(unsigned char*) * ((size_t)localTree->mxtips + 1));
+
+
+      localPartitions->partitionData[model]->xVector = (double **)rax_calloc(sizeof(double*), (size_t)localTree->mxtips);
+
+      if (localPartitions->partitionData[model]->ascBias)
+       {
+         localPartitions->partitionData[model]->ascOffset    = 4 * localPartitions->partitionData[model]->states * localPartitions->partitionData[model]->states;
+         localPartitions->partitionData[model]->ascVector    = (double *)rax_malloc(innerNodes * 
+                                                                                    localPartitions->partitionData[model]->ascOffset * 
+                                                                                    sizeof(double));
+         localPartitions->partitionData[model]->ascExpVector = (int *)rax_calloc(innerNodes *
+                                                                                 localPartitions->partitionData[model]->states,
+                                                                                 sizeof(int));
+         localPartitions->partitionData[model]->ascSumBuffer = (double *)rax_malloc(localPartitions->partitionData[model]->ascOffset * sizeof(double)); 
+       }
+
+
+      /* 
+         Initializing the xVector array like this is absolutely required !!!!
+         I don't know which programming genius removed this, but it must absolutely stay in here!!!!
+      */
+      
+      {
+        int k;
+        
+        for(k = 0; k < localTree->mxtips; k++)
+              localPartitions->partitionData[model]->xVector[k] = (double*)NULL;       
+      }
+
+
+      localPartitions->partitionData[model]->xSpaceVector = (size_t *)rax_calloc((size_t)localTree->mxtips, sizeof(size_t));
+
+      const size_t span = (size_t)(localPartitions->partitionData[model]->states) *
+              discreteRateCategories(localTree->rateHetModel);
+
+#ifdef __MIC_NATIVE
+
+      // Alexey: sum buffer buffer padding for Xeon PHI
+      const int aligned_width = width % PLL_VECTOR_WIDTH == 0 ? width : width + (PLL_VECTOR_WIDTH - (width % PLL_VECTOR_WIDTH));
+
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->sumBuffer), PLL_BYTE_ALIGNMENT, aligned_width *
+                                                                                      span *
+                                                                                      sizeof(double));
+
+      // Alexey: fill padding entries with 1. (will be corrected with site weights, see below)
+      {
+          int k;
+          for (k = width*span; k < aligned_width*span; ++k)
+              localPartitions->partitionData[model]->sumBuffer[k] = 1.;
+      }
+
+#else
+
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->sumBuffer), PLL_BYTE_ALIGNMENT, width *
+                                              span *
+                                              sizeof(double));
+#endif
+
+      /* Initialize buffers to store per-site log likelihoods */
+
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->perSiteLikelihoods), PLL_BYTE_ALIGNMENT, width * sizeof(double));
+
+      /* initialize data structures for per-site likelihood scaling */
+
+      if(localTree->fastScaling)
+        {
+           localPartitions->partitionData[model]->expVector      = (int **)NULL;
+           localPartitions->partitionData[model]->expSpaceVector = (size_t *)NULL;
+        }
+      else
+        {        
+          localPartitions->partitionData[model]->expVector      = (int **)rax_malloc(sizeof(int*) * innerNodes);
+           
+          /* 
+             Initializing the expVector array like this is absolutely required !!!!
+             Not doing this can (and did) cause segmentation faults !!!!
+          */
+          
+          {
+            int k;
+
+            for(k = 0; k < innerNodes; k++)
+              localPartitions->partitionData[model]->expVector[k] = (int*)NULL; 
+          }
+
+          localPartitions->partitionData[model]->expSpaceVector = (size_t *)rax_calloc(innerNodes, sizeof(size_t));
+        }
+
+      /* data structure to store the marginal ancestral probabilities in the sequential version or for each thread */
+
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->ancestralBuffer), PLL_BYTE_ALIGNMENT, width *
+                                                                                 (size_t)(localPartitions->partitionData[model]->states) *
+                                                                                 sizeof(double));
+
+      /* count and accumulate how many bytes we will need for storing a full ancestral vector. for this we add up the per-partition space requirements in bytes */
+      /* ancestralVectorWidth += ((size_t)(pr->partitionData[model]->upper - pr->partitionData[model]->lower) * (size_t)(localPartitions->partitionData[model]->states) * sizeof(double)); */
+      ancestralVectorWidth += ((size_t)(localPartitions->partitionData[model]->upper - localPartitions->partitionData[model]->lower) * (size_t)(localPartitions->partitionData[model]->states) * sizeof(double));
+      /* :TODO: do we have to use the original tree for that   */
+
+#ifdef __MIC_NATIVE
+
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->wgt), PLL_BYTE_ALIGNMENT, aligned_width * sizeof(int));
+
+      // Alexey: fill padding entries with 0.
+      {
+          int k;
+          for (k = width; k < aligned_width; ++k)
+              localPartitions->partitionData[model]->wgt[k] = 0;
+      }
+#else
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->wgt), PLL_BYTE_ALIGNMENT, width * sizeof(int));
+#endif
+
+      /* rateCategory must be assigned using rax_calloc() at start up there is only one rate category 0 for all sites */
+
+      localPartitions->partitionData[model]->rateCategory = (int *)rax_calloc(width, sizeof(int));
+
+      if(width > 0 && localTree->saveMemory)
+        {
+          localPartitions->partitionData[model]->gapVectorLength = ((int)width / 32) + 1;
+          assert(4 == sizeof(unsigned int));
+          localPartitions->partitionData[model]->gapVector = (unsigned int*)rax_calloc((size_t)localPartitions->partitionData[model]->gapVectorLength * 2 * (size_t)localTree->mxtips, sizeof(unsigned int));
+          rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->gapColumn),PLL_BYTE_ALIGNMENT, ((size_t)localTree->mxtips) *
+                                                                               ((size_t)(localPartitions->partitionData[model]->states)) *
+                                                                               discreteRateCategories(localTree->rateHetModel) * sizeof(double));
+        }
+      else
+        {
+          localPartitions->partitionData[model]->gapVectorLength = 0;
+          localPartitions->partitionData[model]->gapVector = (unsigned int*)NULL;
+          localPartitions->partitionData[model]->gapColumn = (double*)NULL;
+        }              
+    }
+}
+
+int virtual_width( int n ) {
+    const int global_vw = 2;
+    return (n+1) / global_vw * global_vw;
+}
+
+
+void initMemorySavingAndRecom(pllInstance *tr, partitionList *pr)
+{
+  pllInstance  
+    *localTree = tr; 
+  partitionList
+    *localPartitions = pr;
+  size_t model; 
+
+  /* initialize gap bit vectors at tips when memory saving option is enabled */
+
+  if(localTree->saveMemory)
+    {
+      for(model = 0; model < (size_t)localPartitions->numberOfPartitions; model++)
+        {
+          int        
+            undetermined = getUndetermined(localPartitions->partitionData[model]->dataType);
+
+          size_t
+            i,
+            j,
+            width =  localPartitions->partitionData[model]->width;
+
+          if(width > 0)
+            {                                        
+              for(j = 1; j <= (size_t)(localTree->mxtips); j++)
+                for(i = 0; i < width; i++)
+                  if(localPartitions->partitionData[model]->yVector[j][i] == undetermined)
+                    localPartitions->partitionData[model]->gapVector[localPartitions->partitionData[model]->gapVectorLength * j + i / 32] |= mask32[i % 32];
+            }     
+        }
+    }
+  /* recom */
+  if(localTree->useRecom)
+    allocRecompVectorsInfo(localTree);
+  else
+    localTree->rvec = (recompVectors*)NULL;
+  /* E recom */
+}
+
+/** @brief Get the length of a specific branch
+
+    Get the length of the branch specified by node \a p and \a p->back
+    of partition \a partition_id.
+    The branch length is decoded from the PLL representation.
+
+    @param tr
+      PLL instance
+
+    @param p
+      Specifies one end-point of the branch. The other one is \a p->back
+
+    @param partition_id
+      Specifies the partition
+
+    @return
+      The branch length
+*/
+double pllGetBranchLength (pllInstance *tr, nodeptr p, int partition_id)
+{
+  //assert(partition_id < tr->numBranches);
+  assert(partition_id < PLL_NUM_BRANCHES);
+  assert(partition_id >= 0);
+  assert(tr->fracchange != -1.0);
+  double z = p->z[partition_id];
+  if(z < PLL_ZMIN) z = PLL_ZMIN;
+  if(z > PLL_ZMAX) z = PLL_ZMAX;
+  return (-log(z) * tr->fracchange);
+}
+
+/** @brief Set the length of a specific branch
+
+    Set the length of the branch specified by node \a p and \a p->back
+    of partition \a partition_id.
+    The function encodes the branch length to the PLL representation.
+
+    @param tr
+      PLL instance
+
+    @param p
+      Specifies one end-point of the branch. The other one is \a p->back
+
+    @param partition_id
+      Specifies the partition
+
+    @param bl
+      Branch length
+*/
+void pllSetBranchLength (pllInstance *tr, nodeptr p, int partition_id, double bl)
+{
+  //assert(partition_id < tr->numBranches);
+  assert(partition_id < PLL_NUM_BRANCHES);
+  assert(partition_id >= 0);
+  assert(tr->fracchange != -1.0);
+  double z;
+  z = exp((-1 * bl)/tr->fracchange);
+  if(z < PLL_ZMIN) z = PLL_ZMIN;
+  if(z > PLL_ZMAX) z = PLL_ZMAX;
+  p->z[partition_id] = z;
+}
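+
+/* A short round-trip sketch, assuming the model has been initialized so that
+   tr->fracchange is set: pllSetBranchLength() encodes a branch length bl as
+   z = exp(-bl / tr->fracchange) and pllGetBranchLength() decodes it back as
+   -log(z) * tr->fracchange, so for values that are not clamped to PLL_ZMIN/PLL_ZMAX
+   the two functions are inverses of each other. */
+static void exampleBranchLengthRoundTrip(pllInstance *tr, nodeptr p)
+{
+  double bl = 0.05;
+
+  pllSetBranchLength(tr, p, 0, bl);
+  assert(fabs(pllGetBranchLength(tr, p, 0) - bl) < 1.0e-10);
+}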
+
+#if (!defined(_FINE_GRAIN_MPI) && !defined(_USE_PTHREADS))
+static void initializePartitionsSequential(pllInstance *tr, partitionList *pr)
+{ 
+  size_t
+    model;
+
+  for(model = 0; model < (size_t)pr->numberOfPartitions; model++)
+    assert(pr->partitionData[model]->width == pr->partitionData[model]->upper - pr->partitionData[model]->lower);
+
+  initializePartitionData(tr, pr);
+
+  /* figure in tip sequence data per-site pattern weights */ 
+  for(model = 0; model < (size_t)pr->numberOfPartitions; model++)
+  {
+    size_t
+      j;
+    size_t lower = pr->partitionData[model]->lower;
+    size_t width = pr->partitionData[model]->upper - lower;
+
+    for(j = 1; j <= (size_t)tr->mxtips; j++)
+    {
+      pr->partitionData[model]->yVector[j] = &(tr->yVector[j][pr->partitionData[model]->lower]);
+    }
+
+    memcpy((void*)(&(pr->partitionData[model]->wgt[0])),         (void*)(&(tr->aliaswgt[lower])),      sizeof(int) * width);
+  }  
+
+  initMemorySavingAndRecom(tr, pr);
+}
+#endif
+
+
+/* interface to outside  */
+//void initializePartitions(pllInstance *tr, pllInstance *localTree, partitionList *pr, partitionList *localPr, int tid, int n)
+//{
+//#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+//  initializePartitionsMaster(tr,localTree,pr,localPr,tid,n);
+//#else
+//  initializePartitionsSequential(tr, pr);
+//#endif
+//}
+
+static void freeLinkageList( linkageList* ll)
+{
+  int i;    
+
+  for(i = 0; i < ll->entries; i++)    
+    rax_free(ll->ld[i].partitionList);         
+
+  rax_free(ll->ld);
+  rax_free(ll);   
+}
+
+/** @brief Free all data structures associated with a partition list
+    
+    Frees all data structures allocated for the partitions in the list.
+
+    @param tr
+      The PLL instance
+
+    @param partitions
+      Pointer to the partition list; it is set to \b NULL on return
+*/
+void 
+pllPartitionsDestroy (pllInstance * tr, partitionList ** partitions)
+{
+  int i, j, tips;
+  partitionList * pl = *partitions;
+
+#ifdef _USE_PTHREADS
+  int tid = tr->threadID;
+  if (MASTER_P) {
+     pllMasterBarrier (tr, pl, PLL_THREAD_EXIT_GRACEFULLY);
+     pllStopPthreads (tr);
+    }
+#endif
+
+  tips = tr->mxtips;
+
+#ifdef _USE_PTHREADS
+  if (MASTER_P) {
+#endif
+#ifdef _FINE_GRAIN_MPI
+if (MASTER_P) {
+     pllMasterBarrier (tr, pl, PLL_THREAD_EXIT_GRACEFULLY);
+#endif
+  freeLinkageList(pl->alphaList);
+  freeLinkageList(pl->freqList); 
+  freeLinkageList(pl->rateList);
+#ifdef _FINE_GRAIN_MPI
+}
+#endif
+
+#ifdef _USE_PTHREADS
+  }
+#endif
+  for (i = 0; i < pl->numberOfPartitions; ++ i)
+   {
+     rax_free (pl->partitionData[i]->gammaRates);
+     rax_free (pl->partitionData[i]->perSiteRates);
+     rax_free (pl->partitionData[i]->globalScaler);
+     rax_free (pl->partitionData[i]->left);
+     rax_free (pl->partitionData[i]->right);
+     rax_free (pl->partitionData[i]->EIGN);
+     rax_free (pl->partitionData[i]->EV);
+     rax_free (pl->partitionData[i]->EI);
+     rax_free (pl->partitionData[i]->substRates);
+     rax_free (pl->partitionData[i]->frequencies);
+     rax_free (pl->partitionData[i]->freqExponents);
+     rax_free (pl->partitionData[i]->empiricalFrequencies);
+     rax_free (pl->partitionData[i]->tipVector);
+     rax_free (pl->partitionData[i]->symmetryVector);
+     rax_free (pl->partitionData[i]->frequencyGrouping);
+     for (j = 0; j < tips; ++ j)
+       rax_free (pl->partitionData[i]->xVector[j]);
+     rax_free (pl->partitionData[i]->xVector);
+     rax_free (pl->partitionData[i]->yVector);
+     rax_free (pl->partitionData[i]->xSpaceVector);
+     rax_free (pl->partitionData[i]->sumBuffer);
+     rax_free (pl->partitionData[i]->ancestralBuffer);
+     rax_free (pl->partitionData[i]->wgt);
+     rax_free (pl->partitionData[i]->rateCategory);
+     rax_free (pl->partitionData[i]->gapVector);
+     rax_free (pl->partitionData[i]->gapColumn);
+     rax_free (pl->partitionData[i]->perSiteLikelihoods);
+     rax_free (pl->partitionData[i]->partitionName);
+     rax_free (pl->partitionData[i]->expSpaceVector);
+     /*TODO: Deallocate all entries of expVector */
+     if (pl->partitionData[i]->expVector)
+      {
+        for (j = 0; j < tips - 2; ++ j)
+          rax_free (pl->partitionData[i]->expVector[j]);
+      }
+     rax_free (pl->partitionData[i]->expVector);
+     rax_free (pl->partitionData[i]);
+   }
+  rax_free (pl->partitionData);
+  rax_free (pl);
+
+  *partitions = NULL;
+
+#if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
+     rax_free (tr->y_ptr);
+#endif
+}
+
+/** @ingroup instanceLinkingGroup
+    @brief Correspondence check between partitions and alignment
+
+    This function checks whether the partitions to be created and the given
+    alignment correspond, that is, whether each site of the alignment is
+    assigned to exactly one partition.
+
+    @param parts
+      A list of partitions suggested by the caller
+
+    @param alignmentData
+      The multiple sequence alignment
+    
+    @return
+      Returns \a 1 in case of success, otherwise \a 0
+*/
+int
+pllPartitionsValidate (pllQueue * parts, pllAlignmentData * alignmentData)
+{
+  int nparts;
+  char * used;
+  struct pllQueueItem * elm;
+  struct pllQueueItem * regionItem;
+  pllPartitionRegion * region;
+  pllPartitionInfo * pi;
+  int i;
+
+  /* check if the list contains at least one partition */
+  nparts = pllQueueSize (parts);
+  if (!nparts)          
+    return (0);   
+
+  /* pllBoolean array for marking that a site was assigned a partition */
+  used = (char *) rax_calloc (alignmentData->sequenceLength, sizeof (char));
+
+  /* traverse all partitions and their respective regions and mark sites */
+  for (elm = parts->head; elm; elm = elm->next)
+   {
+     pi = (pllPartitionInfo *) elm->item;
+     
+     for (regionItem = pi->regionList->head; regionItem; regionItem = regionItem->next)
+      {
+        region = (pllPartitionRegion *) regionItem->item;
+        
+        if (region->start < 1 || region->end > alignmentData->sequenceLength) 
+         {
+           rax_free (used);
+           return (0);
+         }
+
+        for (i = region->start - 1; i < region->end; i += region->stride)
+         {
+           if (used[i])
+            {
+              rax_free (used);
+              return (0);
+            }
+           used[i] = 1; 
+         }
+      }
+   }
+
+  /* check whether all sites were assigned a partition */
+  for (i = 0; i < alignmentData->sequenceLength; ++ i)
+    if (used[i] != 1)
+     {
+       rax_free (used);
+       return (0);
+     }
+
+  rax_free (used);
+  return (1);
+}
+
+/** @brief Swap two sites in a buffer
+    
+    Swaps sites \a s1 and \a s2 in buffer \a buf which consists of \a nTaxa + 1
+    taxa (i.e. rows), and the first row contains no information, i.e. it is not
+    accessed.
+
+    @param buf
+      Memory buffer
+
+    @param s1
+      First site
+
+    @param s2
+      Second site
+
+    @param nTaxa
+      Number of taxa, i.e. size of site
+*/
+static __inline void
+swapSite (unsigned char ** buf, int s1, int s2, int nTaxa)
+{
+  int i;
+  int x;
+
+  for (i = 1; i <= nTaxa; ++ i)
+  {
+    x = buf[i][s1];
+    buf[i][s1] = buf[i][s2];
+    buf[i][s2] = x;
+  }
+}
+
+/** @brief Constructs the list of partitions according to the proposed partition scheme
+    
+    A static function that constructs the \a partitionList structure according to
+    the partition scheme \b AFTER the sites have been repositioned in contiguous
+    regions according to the partition scheme.
+
+    @param parts   The list of partitions proposed by the caller
+
+    @param bounds  An array of the new starting and ending positions of sites
+    in the alignment for each partition.  This array is of size 2 * the number of
+    partitions. The elements are always pairs (lower, upper), where the upper
+    bound is the first site that is no longer part of the partition
+
+    @todo Fix the bug in PLL 
+*/
+static partitionList * createPartitions (pllQueue * parts, int * bounds)
+{
+  partitionList * pl;
+  pllPartitionInfo * pi;
+  struct pllQueueItem * elm;
+  int i, j;
+
+  pl = (partitionList *) rax_malloc (sizeof (partitionList));
+  
+  // TODO: fix this
+  pl->perGeneBranchLengths =      0;
+
+  // TODO: change PLL_NUM_BRANCHES to number of partitions I guess
+  pl->partitionData = (pInfo **) rax_calloc (PLL_NUM_BRANCHES, sizeof (pInfo *));
+  
+  for (i = 0, elm = parts->head; elm; elm = elm->next, ++ i)
+   {
+     pi = (pllPartitionInfo *) elm->item;
+
+     /* check whether the data type is valid, and in case it's not, deallocate
+        and return NULL */
+     if (pi->dataType <= PLL_MIN_MODEL || pi->dataType >= PLL_MAX_MODEL)
+      {
+        for (j = 0; j < i; ++ j)
+         {
+           rax_free (pl->partitionData[j]->partitionName);
+           rax_free (pl->partitionData[j]);
+         }
+        rax_free (pl->partitionData);
+        rax_free (pl);
+        return (NULL);
+      }
+
+     pl->partitionData[i] = (pInfo *) rax_malloc (sizeof (pInfo));
+
+     pl->partitionData[i]->lower = bounds[i << 1];
+     pl->partitionData[i]->upper = bounds[(i << 1) + 1];
+     pl->partitionData[i]->width = bounds[(i << 1) + 1] - bounds[i << 1];
+     pl->partitionData[i]->partitionWeight = 1.0 * (double) pl->partitionData[i]->width;
+
+     //the two flags below are required to allow users to set 
+     //alpha parameters and substitution rates in the Q matrix 
+     //to fixed values. These parameters will then not be optimized 
+     //in the model parameter optimization functions
+     //by default we assume that all parameters are being optimized, i.e., 
+     //this has to be explicitly set by the user 
+     
+     pl->partitionData[i]->optimizeAlphaParameter    = PLL_TRUE;
+     pl->partitionData[i]->optimizeSubstitutionRates = PLL_TRUE;
+     pl->partitionData[i]->dataType                  = pi->dataType;
+     pl->partitionData[i]->protModels                = -1;
+     pl->partitionData[i]->protUseEmpiricalFreqs     = -1;
+     pl->partitionData[i]->maxTipStates              = pLengths[pi->dataType].undetermined + 1;
+     pl->partitionData[i]->optimizeBaseFrequencies   = pi->optimizeBaseFrequencies;
+     pl->partitionData[i]->ascBias                   = pi->ascBias;
+     pl->partitionData[i]->parsVect                  = NULL;
+
+
+
+     if (pi->dataType == PLL_AA_DATA)
+      {
+        if(pl->partitionData[i]->protModels != PLL_GTR)
+          pl->partitionData[i]->optimizeSubstitutionRates = PLL_FALSE;
+        pl->partitionData[i]->protUseEmpiricalFreqs     = pi->protUseEmpiricalFreqs;
+        pl->partitionData[i]->protModels                = pi->protModels;
+      }
+
+     pl->partitionData[i]->states                = pLengths[pl->partitionData[i]->dataType].states;
+     pl->partitionData[i]->numberOfCategories    =        1;
+     pl->partitionData[i]->autoProtModels        =        0;
+     pl->partitionData[i]->nonGTR                =        PLL_FALSE;
+     pl->partitionData[i]->partitionContribution =     -1.0;
+     pl->partitionData[i]->partitionLH           =      0.0;
+     pl->partitionData[i]->fracchange            =      1.0;
+     pl->partitionData[i]->executeModel          =     PLL_TRUE;
+
+
+     pl->partitionData[i]->partitionName         = (char *) rax_malloc ((strlen (pi->partitionName) + 1) * sizeof (char));
+     strcpy (pl->partitionData[i]->partitionName, pi->partitionName);
+   }
+
+  return (pl);
+}
+
+
+/** @ingroup instanceLinkingGroup
+    @brief Constructs the proposed partition scheme 
+
+    This function constructs the proposed partition scheme. It assumes
+    that the partition scheme is correct.
+
+    @note This function \b does \b not validate the partition scheme.
+    The user must manually call the ::pllPartitionsValidate function
+    for validation
+    
+    @param parts
+      A list of partitions suggested by the caller
+
+    @param alignmentData
+      The multiple sequence alignment
+
+    @return
+      Returns a pointer to \a partitionList structure of partitions in case of success, \b NULL otherwise
+*/
+partitionList * pllPartitionsCommit (pllQueue * parts, pllAlignmentData * alignmentData)
+{
+  int * oi;
+  int i, j, dst;
+  struct pllQueueItem * elm;
+  struct pllQueueItem * regionItem;
+  pllPartitionRegion * region;
+  pllPartitionInfo * pi;
+  partitionList * pl;
+  int * newBounds;
+  int k, nparts;
+  int tmpvar;
+ 
+
+  dst = k = 0;
+  oi  = (int *) rax_malloc (alignmentData->sequenceLength * sizeof (int));
+  for (i = 0; i < alignmentData->sequenceLength; ++ i) oi[i] = i;
+
+  nparts = pllQueueSize (parts);
+  newBounds = (int *) rax_malloc (2 * nparts * sizeof (int));
+
+  /* reposition the sites in the alignment */
+  for (elm = parts->head; elm; elm = elm->next, ++ k)
+   {
+     pi = (pllPartitionInfo *) elm->item;
+     
+     newBounds[k << 1] = dst;   /* set the lower column for this partition */
+     for (regionItem = pi->regionList->head; regionItem; regionItem = regionItem->next)
+      {
+        region = (pllPartitionRegion *) regionItem->item;
+
+        for (i = region->start - 1; i < region->end && i < alignmentData->sequenceLength; i += region->stride)
+         {
+           if (oi[i] == i)
+            {
+              swapSite (alignmentData->sequenceData, dst, i, alignmentData->sequenceCount);
+              tmpvar = oi[i];
+              oi[i] = oi[dst];
+              oi[dst++] = tmpvar;
+            }
+           else
+            {
+              j = i;
+              while (oi[j] != i) j = oi[j];
+
+              swapSite (alignmentData->sequenceData, dst, j, alignmentData->sequenceCount);
+              tmpvar = oi[j];
+              oi[j] = oi[dst];
+              oi[dst++] = tmpvar;
+            }
+         }
+      }
+     newBounds[(k << 1) + 1] = dst;    /* set the upper limit for this partition */
+   }
+  if ((pl = createPartitions (parts, newBounds)))
+   { 
+     pl->numberOfPartitions = nparts;
+     pl->dirty = PLL_FALSE;
+   }
+  
+  rax_free (newBounds);
+  rax_free (oi);
+
+  return (pl);
+}
+
+/** @brief Copy a site to another buffer
+
+    Copies site \a from from buffer \a src to \a to in buffer \a dst. Both buffers
+    must consist of \a nTaxa + 1 taxa and the first row contains no information, i.e.
+    it is not accessed.
+
+    @param dst
+      Destination buffer
+
+    @param src
+      Source buffer
+
+    @param to
+      At which position in \a dst to copy the site to
+
+    @param from
+      Which site from \a src to copy
+
+    @param nTaxa
+      Number of taxa, i.e. size of site
+*/
+static __inline void
+copySite (unsigned char ** dst, unsigned char ** src, int to, int from, int nTaxa)
+{
+  int i;
+
+  for (i = 1; i <= nTaxa; ++ i)
+   {
+     dst[i][to] = src[i][from];
+   }
+}
+
+/** @brief Remove duplicate sites from alignment and update weights vector
+
+    Removes duplicate sites from the alignment given the partitions list
+    and updates the weight vector of the alignment and the boundaries
+    (upper, lower, width) for each partition.
+
+    @param alignmentData
+      The multiple sequence alignment
+    
+    @param pl
+      List of partitions
+
+*/
+void 
+pllAlignmentRemoveDups (pllAlignmentData * alignmentData, partitionList * pl)
+{
+  int i, j, k, p;
+  char *** sites;
+  void ** memptr;
+  int ** oi;
+  int dups = 0;
+  int lower;
+
+  /* allocate space for the transposed alignments (sites) for every partition */
+  sites  = (char ***) rax_malloc (pl->numberOfPartitions * sizeof (char **));
+  memptr = (void **)  rax_malloc (pl->numberOfPartitions * sizeof (void *));
+  oi     = (int **)   rax_malloc (pl->numberOfPartitions * sizeof (int *));
+
+  /* transpose the sites by partition */
+  for (p = 0; p < pl->numberOfPartitions; ++ p)
+   {
+     sites[p]  = (char **) rax_malloc (sizeof (char *) * pl->partitionData[p]->width);
+     memptr[p] = rax_malloc (sizeof (char) * (alignmentData->sequenceCount + 1) * pl->partitionData[p]->width);
+
+     for (i = 0; i < pl->partitionData[p]->width; ++ i)
+      {
+        sites[p][i] = (char *) ((char*)memptr[p] + sizeof (char) * i * (alignmentData->sequenceCount + 1));
+      }
+
+     for (i = 0; i < pl->partitionData[p]->width; ++ i)
+      {
+        for (j = 0; j < alignmentData->sequenceCount; ++ j)
+         {
+           sites[p][i][j] = alignmentData->sequenceData[j + 1][pl->partitionData[p]->lower + i]; 
+         }
+        sites[p][i][j] = 0;
+      }
+
+     oi[p] = pllssort1main (sites[p], pl->partitionData[p]->width);
+
+     for (i = 0; i < pl->partitionData[p]->width; ++ i) oi[p][i] = 1;
+
+     for (i = 1; i < pl->partitionData[p]->width; ++ i)
+      {
+        if (! strcmp (sites[p][i], sites[p][i - 1]))
+         {
+           ++ dups;
+           oi[p][i] = 0;
+         }
+      }
+   }
+
+  /* allocate memory for the alignment without duplicates*/
+  rax_free (alignmentData->sequenceData[1]);
+  rax_free (alignmentData->siteWeights);
+
+  alignmentData->sequenceLength = alignmentData->sequenceLength - dups;
+  alignmentData->sequenceData[0] = (unsigned char *) rax_malloc ((alignmentData->sequenceLength + 1) * sizeof (unsigned char) * alignmentData->sequenceCount);
+  for (i = 0; i < alignmentData->sequenceCount; ++ i)
+   {
+     alignmentData->sequenceData[i + 1] = (unsigned char *) (alignmentData->sequenceData[0] + sizeof (unsigned char) * i * (alignmentData->sequenceLength + 1));
+     alignmentData->sequenceData[i + 1][alignmentData->sequenceLength] = 0;
+   }
+
+  alignmentData->siteWeights    = (int *) rax_malloc ((alignmentData->sequenceLength) * sizeof (int));
+  alignmentData->siteWeights[0] = 1;
+
+  /* transpose sites back to alignment */
+  for (p = 0, k = 0; p < pl->numberOfPartitions; ++ p)
+   {
+     lower = k;
+     for (i = 0; i < pl->partitionData[p]->width; ++ i)
+      {
+        if (!oi[p][i])
+         {
+           ++ alignmentData->siteWeights[k - 1];
+         }
+        else
+         {
+           alignmentData->siteWeights[k] = 1;
+           for (j = 0; j < alignmentData->sequenceCount; ++ j)
+            {
+              alignmentData->sequenceData[j + 1][k] = sites[p][i][j];
+            }
+           ++ k;
+         }
+      }
+     pl->partitionData[p]->lower = lower;
+     pl->partitionData[p]->upper = k;
+     pl->partitionData[p]->width = k - lower;
+   }
+
+  /* deallocate storage for transposed alignment (sites) */
+  for (p = 0; p < pl->numberOfPartitions; ++ p)
+   {
+     rax_free (oi[p]);
+     rax_free (memptr[p]);
+     rax_free (sites[p]);
+   }
+  rax_free (oi);
+  rax_free (sites);
+  rax_free (memptr);
+}
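+
+/* Usage sketch (illustrative only): duplicate-site removal is intended to run
+   on the alignment after the partitions have been committed, since it updates
+   both the site weights and the partition boundaries.
+
+     partitionList * pl = pllPartitionsCommit (parts, alignmentData);
+     pllAlignmentRemoveDups (alignmentData, pl);
+*/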
+
+
+/** @brief Compute the empirical frequencies of a partition
+  
+    Compute the empirical frequencies of partition \a partition and store them in
+    \a pfreqs.
+
+    @param partition
+      The partition for which to compute empirical frequencies
+
+    @param alignmentData
+      The multiple sequence alignment
+
+    @param smoothFrequencies
+      Currently unused (see the commented-out smoothing code at the end of this function)
+
+    @param bitMask
+      The bitmask
+
+    @param pfreqs
+      Array of size \a partition->states where the empirical frequencies for this partition are stored
+*/
+static int genericBaseFrequenciesAlignment (pInfo * partition, 
+                                              pllAlignmentData * alignmentData, 
+                                              pllBoolean smoothFrequencies,
+                                              const unsigned int * bitMask, 
+                                              double * pfreqs)
+{
+  double 
+    wj, 
+    acc,
+    sumf[64],   
+    temp[64];
+ 
+  int     
+    i, 
+    j, 
+    k, 
+    l,
+    numFreqs,
+    lower,
+    upper;
+
+  unsigned char  *yptr;  
+  const char * map;
+  
+  switch (partition->dataType)
+   {
+     case PLL_BINARY_DATA:
+       map = PLL_MAP_BIN;
+       break;
+     case PLL_DNA_DATA:
+       map = PLL_MAP_NT;
+       break;
+     case PLL_AA_DATA:
+       map = PLL_MAP_AA;
+       break;
+     default:
+       assert(0);
+   }
+
+  numFreqs = partition->states;
+  lower    = partition->lower;
+  upper    = partition->upper;
+
+  for(l = 0; l < numFreqs; l++)     
+    pfreqs[l] = 1.0 / ((double)numFreqs);
+          
+  for (k = 1; k <= 8; k++) 
+    {                                                   
+      for(l = 0; l < numFreqs; l++)
+        sumf[l] = 0.0;
+              
+      for (i = 1; i <= alignmentData->sequenceCount; i++) 
+        {                
+          yptr = alignmentData->sequenceData[i];
+          
+          for(j = lower; j < upper; j++) 
+            {
+              if (map[yptr[j]] < 0) return (0);
+              unsigned int code = bitMask[(unsigned char)map[yptr[j]]];
+              assert(code >= 1);
+              
+              for(l = 0; l < numFreqs; l++)
+                {
+                  if((code >> l) & 1)
+                    temp[l] = pfreqs[l];
+                  else
+                    temp[l] = 0.0;
+                }                             
+              
+              for(l = 0, acc = 0.0; l < numFreqs; l++)
+                {
+                  if(temp[l] != 0.0)
+                    acc += temp[l];
+                }
+              
+              wj = alignmentData->siteWeights[j] / acc;
+              
+              for(l = 0; l < numFreqs; l++)
+                {
+                  if(temp[l] != 0.0)                
+                    sumf[l] += wj * temp[l];                                                                                               
+                }
+            }
+        }                     
+      
+      for(l = 0, acc = 0.0; l < numFreqs; l++)
+        {
+          if(sumf[l] != 0.0)
+            acc += sumf[l];
+        }
+              
+      for(l = 0; l < numFreqs; l++)
+        pfreqs[l] = sumf[l] / acc;           
+    }
+
+   /* TODO: What is that? */
+/*
+  if(smoothFrequencies)         
+   {;
+    smoothFreqs(numFreqs, pfreqs,  tr->partitionData[model].frequencies, &(tr->partitionData[model]));     
+   }
+  else    
+    {
+      pllBoolean
+        zeroFreq = PLL_FALSE;
+
+      char 
+        typeOfData[1024];
+
+      getDataTypeString(tr, model, typeOfData);  
+
+      for(l = 0; l < numFreqs; l++)
+        {
+          if(pfreqs[l] == 0.0)
+            {
+              printBothOpen("Empirical base frequency for state number %d is equal to zero in %s data partition %s\n", l, typeOfData, tr->partitionData[model].partitionName);
+              printBothOpen("Since this is probably not what you want to do, RAxML will soon exit.\n\n");
+              zeroFreq = PLL_TRUE;
+            }
+        }
+
+      if(zeroFreq)
+        exit(-1);
+
+      for(l = 0; l < numFreqs; l++)
+        {
+          assert(pfreqs[l] > 0.0);
+          tr->partitionData[model].frequencies[l] = pfreqs[l];
+        }     
+    }  
+*/
+  return (1);
+  
+}
+
+static void  genericBaseFrequenciesInstance (pInfo * partition, 
+                                             pllInstance * tr, 
+                                             pllBoolean smoothFrequencies,
+                                             const unsigned int * bitMask, 
+                                             double * pfreqs)
+{
+  double 
+    wj, 
+    acc,
+    sumf[64],   
+    temp[64];
+ 
+  int     
+    i, 
+    j, 
+    k, 
+    l,
+    numFreqs,
+    lower,
+    upper;
+
+  unsigned char  *yptr;  
+
+  numFreqs = partition->states;
+  lower    = partition->lower;
+  upper    = partition->upper;
+
+  for(l = 0; l < numFreqs; l++)     
+    pfreqs[l] = 1.0 / ((double)numFreqs);
+          
+  for (k = 1; k <= 8; k++) 
+    {                                                   
+      for(l = 0; l < numFreqs; l++)
+        sumf[l] = 0.0;
+              
+      for (i = 1; i <= tr->mxtips; i++) 
+        {                
+          yptr = tr->yVector[i];
+          
+          for(j = lower; j < upper; j++) 
+            {
+              unsigned int code = bitMask[yptr[j]];
+              assert(code >= 1);
+              
+              for(l = 0; l < numFreqs; l++)
+                {
+                  if((code >> l) & 1)
+                    temp[l] = pfreqs[l];
+                  else
+                    temp[l] = 0.0;
+                }                             
+              
+              for(l = 0, acc = 0.0; l < numFreqs; l++)
+                {
+                  if(temp[l] != 0.0)
+                    acc += temp[l];
+                }
+              
+              wj = tr->aliaswgt[j] / acc;
+              
+              for(l = 0; l < numFreqs; l++)
+                {
+                  if(temp[l] != 0.0)                
+                    sumf[l] += wj * temp[l];                                                                                               
+                }
+            }
+        }                     
+      
+      for(l = 0, acc = 0.0; l < numFreqs; l++)
+        {
+          if(sumf[l] != 0.0)
+            acc += sumf[l];
+        }
+              
+      for(l = 0; l < numFreqs; l++)
+        pfreqs[l] = sumf[l] / acc;           
+    }
+
+   /* TODO: What is that? */
+/*
+  if(smoothFrequencies)         
+   {;
+    smoothFreqs(numFreqs, pfreqs,  tr->partitionData[model].frequencies, &(tr->partitionData[model]));     
+   }
+  else    
+    {
+      pllBoolean
+        zeroFreq = PLL_FALSE;
+
+      char 
+        typeOfData[1024];
+
+      getDataTypeString(tr, model, typeOfData);  
+
+      for(l = 0; l < numFreqs; l++)
+        {
+          if(pfreqs[l] == 0.0)
+            {
+              printBothOpen("Empirical base frequency for state number %d is equal to zero in %s data partition %s\n", l, typeOfData, tr->partitionData[model].partitionName);
+              printBothOpen("Since this is probably not what you want to do, RAxML will soon exit.\n\n");
+              zeroFreq = PLL_TRUE;
+            }
+        }
+
+      if(zeroFreq)
+        exit(-1);
+
+      for(l = 0; l < numFreqs; l++)
+        {
+          assert(pfreqs[l] > 0.0);
+          tr->partitionData[model].frequencies[l] = pfreqs[l];
+        }     
+    }  
+*/
+
+  
+}
+
+/**  Compute the empirical base frequencies of an alignment
+
+     Computes the empirical base frequencies per partition of an alignment \a alignmentData
+     given the partition structure \a pl.
+
+     @param alignmentData The alignment structure for which to compute the empirical base frequencies
+     @param pl            List of partitions
+     @return Returns a list of frequencies for each partition
+*/
+double ** pllBaseFrequenciesAlignment (pllAlignmentData * alignmentData, partitionList * pl)
+{
+  int
+    i,
+    model;
+
+  double 
+    **freqs = (double **) rax_malloc (pl->numberOfPartitions * sizeof (double *));
+
+  for (model = 0; model < pl->numberOfPartitions; ++ model)
+    {
+      freqs[model] = (double *) rax_malloc (pl->partitionData[model]->states * sizeof (double));
+      
+      switch  (pl->partitionData[model]->dataType)
+        {
+        case PLL_BINARY_DATA:
+        case PLL_AA_DATA:
+        case PLL_DNA_DATA:
+          if (!genericBaseFrequenciesAlignment (pl->partitionData[model], 
+                                                alignmentData, 
+                                                pLengths[pl->partitionData[model]->dataType].smoothFrequencies,
+                                                pLengths[pl->partitionData[model]->dataType].bitVector,
+                                                freqs[model]
+                                               ))
+            return (NULL);
+          break;
+        default:
+          {
+            errno = PLL_UNKNOWN_MOLECULAR_DATA_TYPE;
+            for (i = 0; i <= model; ++ i) rax_free (freqs[i]);
+            rax_free (freqs);
+            return (double **)NULL;
+          }
+        }
+    }
+  
+  return (freqs);
+}
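+
+/* Usage sketch (illustrative only): compute per-partition empirical
+   frequencies and release them with pllEmpiricalFrequenciesDestroy (defined
+   further below); a NULL return indicates failure and errno holds the reason.
+
+     double ** ef = pllBaseFrequenciesAlignment (alignmentData, pl);
+     if (ef)
+      {
+        // ... pass ef[i] to the model setup of partition i ...
+        pllEmpiricalFrequenciesDestroy (&ef, pl->numberOfPartitions);
+      }
+*/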
+
+/**  Compute the empirical base frequencies of the alignment incorporated in the instance
+
+     Computes the empirical base frequencies per partition of the alignment
+     incorporated in the instance \a tr given the partition structure \a pl.
+
+     @param tr The instance for which to compute the empirical base frequencies
+     @param pl List of partitions
+     @return Returns a list of frequencies for each partition
+*/
+double ** pllBaseFrequenciesInstance (pllInstance * tr, partitionList * pl)
+{
+  int
+    i,
+    model;
+
+  double 
+    **freqs = (double **) rax_malloc (pl->numberOfPartitions * sizeof (double *));
+
+  for (model = 0; model < pl->numberOfPartitions; ++ model)
+    {
+      freqs[model] = (double *) rax_malloc (pl->partitionData[model]->states * sizeof (double));
+      
+      switch  (pl->partitionData[model]->dataType)
+        {
+        case PLL_AA_DATA:
+        case PLL_DNA_DATA:
+        case PLL_BINARY_DATA:
+          genericBaseFrequenciesInstance (pl->partitionData[model], 
+                                          tr, 
+                                          pLengths[pl->partitionData[model]->dataType].smoothFrequencies,
+                                          pLengths[pl->partitionData[model]->dataType].bitVector,
+                                          freqs[model]
+                                          );
+          break;
+        default:
+          {
+            errno = PLL_UNKNOWN_MOLECULAR_DATA_TYPE;
+            for (i = 0; i <= model; ++ i) rax_free (freqs[i]);
+            rax_free (freqs);
+            return (double **)NULL;
+          }
+        }
+    }
+  
+  return (freqs);
+}
+
+void
+pllEmpiricalFrequenciesDestroy (double *** empiricalFrequencies, int models)
+{
+  int i;
+
+  for (i = 0; i < models; ++ i)
+   {
+     rax_free ((*empiricalFrequencies)[i]);
+   }
+  rax_free (*empiricalFrequencies);
+
+  *empiricalFrequencies = NULL;
+}
+
+int pllLoadAlignment (pllInstance * tr, pllAlignmentData * alignmentData, partitionList * partitions)
+{
+  int i;
+  nodeptr node;
+  pllHashItem * hItem;
+
+  if (tr->mxtips != alignmentData->sequenceCount) return (0);
+
+  tr->aliaswgt = (int *) rax_malloc (alignmentData->sequenceLength * sizeof (int));
+  memcpy (tr->aliaswgt, alignmentData->siteWeights, alignmentData->sequenceLength * sizeof (int));
+
+  tr->originalCrunchedLength = alignmentData->sequenceLength;
+  tr->rateCategory           = (int *)   rax_calloc (tr->originalCrunchedLength, sizeof (int));
+  tr->patrat                 = (double*) rax_malloc((size_t)tr->originalCrunchedLength * sizeof(double));
+  tr->patratStored           = (double*) rax_malloc((size_t)tr->originalCrunchedLength * sizeof(double));
+  tr->lhs                    = (double*) rax_malloc((size_t)tr->originalCrunchedLength * sizeof(double));
+
+  /* allocate memory for the alignment */
+  tr->yVector    = (unsigned char **) rax_malloc ((alignmentData->sequenceCount + 1) * sizeof (unsigned char *));                                                                                                                                                                      
+
+  tr->yVector[0] = (unsigned char *)  rax_malloc (sizeof (unsigned char) * (alignmentData->sequenceLength + 1) * alignmentData->sequenceCount);
+  for (i = 1; i <= alignmentData->sequenceCount; ++ i) 
+   {                     
+     tr->yVector[i] = (unsigned char *) (tr->yVector[0] + (i - 1) * (alignmentData->sequenceLength + 1) * sizeof (unsigned char));
+     tr->yVector[i][alignmentData->sequenceLength] = 0;
+   }                     
+                         
+  /* place sequences to tips */                              
+  for (i = 1; i <= alignmentData->sequenceCount; ++ i)                      
+   {                     
+     if (!pllHashSearch (tr->nameHash, alignmentData->sequenceLabels[i],(void **)&node)) 
+      {
+        //rax_free (tr->originalCrunchedLength);
+        rax_free (tr->rateCategory);
+        rax_free (tr->patrat);
+        rax_free (tr->patratStored);
+        rax_free (tr->lhs);
+        rax_free (tr->yVector[0]);
+        rax_free (tr->yVector);
+        return (0);
+      }
+     memcpy (tr->yVector[node->number], alignmentData->sequenceData[i], alignmentData->sequenceLength);
+   }
+
+  /* Do the base substitution (from A,C,G....  ->   0,1,2,3....)*/
+  pllBaseSubstitute (tr, partitions);
+
+  /* Populate tipNames */
+  tr->tipNames = (char **) rax_calloc(tr->mxtips + 1, sizeof (char *));
+  for (i = 0; (unsigned int)i < tr->nameHash->size; ++ i)
+   {
+     hItem = tr->nameHash->Items[i];
+
+     for (; hItem; hItem = hItem->next)
+      {
+        tr->tipNames[((nodeptr)hItem->data)->number] = hItem->str; 
+      }
+   }
+
+  return (1);
+}
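+
+/* Usage sketch (illustrative only): the instance, its tree topology and the
+   partition list are assumed to be set up before the alignment is linked in;
+   the call returns 0 on a taxon-count mismatch or a failed label lookup.
+
+     if (!pllLoadAlignment (tr, alignmentData, partitions))
+       fprintf (stderr, "alignment labels do not match the tree taxa\n");
+*/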
+
+pllInstance * pllCreateInstance (pllInstanceAttr * attr)
+{
+  pllInstance * tr;
+
+  if (attr->rateHetModel != PLL_GAMMA && attr->rateHetModel != PLL_CAT) return NULL;
+
+  tr = (pllInstance *) rax_calloc (1, sizeof (pllInstance));
+
+  tr->threadID          = 0;
+  tr->rateHetModel      = attr->rateHetModel;
+  tr->fastScaling       = attr->fastScaling;
+  tr->saveMemory        = attr->saveMemory;
+  tr->useRecom          = attr->useRecom;
+  tr->likelihoodEpsilon = 0.01;
+  
+  tr->randomNumberSeed = attr->randomNumberSeed;
+  tr->parsimonyScore   = NULL;
+
+  /* remove it from the library */
+  tr->useMedian         = PLL_FALSE;
+
+  tr->maxCategories     = (attr->rateHetModel == PLL_GAMMA) ? 4 : 25;
+
+  tr->numberOfThreads   = attr->numberOfThreads;
+  tr->rearrangeHistory  = NULL;
+
+  /* Lock the slave processors at this point */
+#ifdef _FINE_GRAIN_MPI
+  pllLockMPI (tr);
+#endif
+
+  return (tr);
+}
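+
+/* Usage sketch (illustrative only): the attribute values below are just one
+   plausible configuration, not recommended defaults.
+
+     pllInstanceAttr attr;
+     attr.rateHetModel     = PLL_GAMMA;
+     attr.fastScaling      = PLL_FALSE;
+     attr.saveMemory       = PLL_FALSE;
+     attr.useRecom         = PLL_FALSE;
+     attr.randomNumberSeed = 12345;
+     attr.numberOfThreads  = 1;
+
+     pllInstance * inst = pllCreateInstance (&attr);
+     // inst is NULL if the rate heterogeneity model is neither GAMMA nor CAT
+*/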
+
+/** @brief Initialize PLL tree structure with default values
+    
+    Initialize PLL tree structure with default values and allocate 
+    memory for its elements.
+
+    @todo
+      STILL NOT FINISHED
+*/
+static void pllTreeInitDefaults (pllInstance * tr, int tips)
+{
+  nodeptr p0, p, q;
+  int i, j;
+  int inner;
+
+  
+
+  /* TODO: make a proper static setupTree function */
+
+  inner = tips - 1;
+
+  tr->mxtips = tips;
+
+  tr->bigCutoff = PLL_FALSE;
+  tr->treeStringLength = tr->mxtips * (PLL_NMLNGTH + 128) + 256 + tr->mxtips * 2;
+  tr->tree_string = (char *) rax_calloc ( tr->treeStringLength, sizeof(char));
+  tr->tree0 = (char*)rax_calloc((size_t)tr->treeStringLength, sizeof(char));
+  tr->tree1 = (char*)rax_calloc((size_t)tr->treeStringLength, sizeof(char));
+  tr->constraintVector = (int *)rax_malloc((2 * tr->mxtips) * sizeof(int));
+  
+  p0 = (nodeptr) rax_malloc ((tips + 3 * inner) * sizeof (node));
+  assert (p0);
+
+  tr->nodeBaseAddress  = p0;
+
+  tr->nameList         = (char **)   rax_malloc ((tips + 1) * sizeof (char *));
+  tr->nodep            = (nodeptr *) rax_malloc ((2 * tips) * sizeof (nodeptr));
+
+  tr->autoProteinSelectionType = PLL_AUTO_ML;
+
+  assert (tr->nameList && tr->nodep);
+
+  tr->nodep[0] = NULL;          
+
+
+  /* TODO: The line below was commented... why? */
+  tr->fracchange = -1;
+  tr->rawFracchange = -1;
+
+  for (i = 1; i <= tips; ++ i)
+   {
+     p = p0++;
+
+     //p->hash      = KISS32();     
+     p->x         = 0;
+     p->xBips     = 0;
+     p->number    = i;
+     p->next      = p;
+     p->back      = NULL;
+     p->bInf      = NULL;
+     tr->nodep[i]  = p;
+   }
+
+  for (i = tips + 1; i <= tips + inner; ++i)
+   {
+     q = NULL;
+     for (j = 1; j <= 3; ++ j)
+     {
+       p = p0++;
+       if (j == 1)
+        {
+          p->xBips = 1;
+          p->x = 1; //p->x     = 1;
+        }
+       else
+        {
+          p->xBips = 0;
+          p->x     = 0;
+        }
+       p->number = i;
+       p->next   = q;
+       p->bInf   = NULL;
+       p->back   = NULL;
+       p->hash   = 0;
+       q         = p;
+     }
+    p->next->next->next = p;
+    tr->nodep[i]         = p;
+   }
+
+  tr->likelihood  = PLL_UNLIKELY;
+  tr->start       = NULL;
+  tr->ntips       = 0;
+  tr->nextnode    = 0;
+
+  for (i = 0; i < PLL_NUM_BRANCHES; ++ i) tr->partitionSmoothed[i] = PLL_FALSE;
+
+  tr->bitVectors = NULL;
+  tr->vLength    = 0;
+  //tr->h          = NULL;
+
+  /* TODO: Fix hash type */
+  tr->nameHash   = pllHashInit (10 * tr->mxtips);
+
+  /* TODO: do these options really fit here or should they be put elsewhere? */
+  tr->td[0].count            = 0;
+  tr->td[0].ti               = (traversalInfo *) rax_malloc (sizeof(traversalInfo) * (size_t)tr->mxtips);
+  tr->td[0].parameterValues  = (double *) rax_malloc(sizeof(double) * (size_t)PLL_NUM_BRANCHES);
+  tr->td[0].executeModel     = (pllBoolean *) rax_malloc (sizeof(pllBoolean) * (size_t)PLL_NUM_BRANCHES);
+  tr->td[0].executeModel[0]  = PLL_TRUE;                                                                                                                                                                                                                                    
+  for (i = 0; i < PLL_NUM_BRANCHES; ++ i) tr->td[0].executeModel[i] = PLL_TRUE;
+}
+
+
+/** @brief Check a parsed tree for inclusion in the current tree
+   
+   Check whether the set of leaves (taxa) of the parsed tree \a nTree is a
+   subset of the leaves of the currently loaded tree.
+
+   @param pInst
+     PLL instance
+
+   @param nTree
+     Parsed newick tree structure
+
+   @return
+     Returns \b PLL_TRUE in case it is a subset, otherwise \b PLL_FALSE
+*/
+static int
+checkTreeInclusion (pllInstance * pInst, pllNewickTree * nTree)
+{
+  pllStack * sList;
+  pllNewickNodeInfo * sItem;
+  void * dummy;
+
+  if (!pInst->nameHash) return (PLL_FALSE);
+
+  for (sList = nTree->tree; sList; sList = sList->next)
+   {
+     sItem = (pllNewickNodeInfo *) sList->item;
+     if (!sItem->rank)   /* leaf */
+      {
+        if (!pllHashSearch (pInst->nameHash, sItem->name, &dummy)) return (PLL_FALSE);
+      }
+   }
+
+  return (PLL_TRUE);
+}
+
+static void
+updateBranchLength (nodeptr p, double old_fracchange, double new_fracchange)
+{
+  double z;
+  int j;
+
+  for (j = 0; j < PLL_NUM_BRANCHES; ++ j)
+   {
+     z = exp ((log (p->z[j]) * old_fracchange) / new_fracchange);
+     if (z < PLL_ZMIN) z = PLL_ZMIN;
+     if (z > PLL_ZMAX) z = PLL_ZMAX;
+     p->z[j] = p->back->z[j] = z;
+   }
+}
+
+static void
+updateAllBranchLengthsRecursive (nodeptr p, int tips, double old_fracchange, double new_fracchange)
+{
+  updateBranchLength (p, old_fracchange, new_fracchange);
+
+  if (!isTip (p->number, tips))
+   {
+     updateAllBranchLengthsRecursive (p->next->back,       tips, old_fracchange, new_fracchange);
+     updateAllBranchLengthsRecursive (p->next->next->back, tips, old_fracchange, new_fracchange);
+   }
+}
+
+static void
+updateAllBranchLengths (pllInstance * tr, double old_fracchange, double new_fracchange)
+{
+  nodeptr p;
+
+  p = tr->start;
+  assert (isTip(p->number, tr->mxtips));
+
+  updateAllBranchLengthsRecursive (p->back, tr->mxtips, old_fracchange, new_fracchange);
+
+}
+
+
+/** @brief Relink the taxa
+    
+    Relink the taxa by performing a preorder traversal of the unrooted binary tree.
+    We assume that the tree is rooted such that the root is the only node of
+    out-degree 3 and in-degree 0, while all the other inner nodes have in-degree
+    1 and out-degree 2. Finally, the leaves have in-degree 1 and out-degree 0.
+
+    @param pInst
+      PLL instance
+
+    @param nTree
+      Parsed newick tree structure
+
+    @param taxaExist
+      Is the set of taxa of \a nTree a subset of the taxa of the current tree
+
+    @return
+      Always returns \b PLL_TRUE
+*/
+static int
+linkTaxa (pllInstance * pInst, pllNewickTree * nTree, int taxaExist)
+{
+  nodeptr 
+    parent,
+    child;
+  pllStack 
+    * nodeStack = NULL,
+    * current;
+  int
+    i,
+    j,
+    inner = nTree->tips + 1,
+    leaf  = 1;
+  double z;
+  pllNewickNodeInfo * nodeInfo;
+
+  if (!taxaExist) pllTreeInitDefaults (pInst, nTree->tips);
+
+  /* Place the ternary root node 3 times on the stack such that later on
+     three nodes use it as their parent */
+  current = nTree->tree;
+  for (parent = pInst->nodep[inner], i  = 0; i < 3; ++ i, parent = parent->next)
+    pllStackPush (&nodeStack, parent);
+  ++ inner;
+
+  /* now traverse the rest of the nodes */
+  for (current = current->next; current; current = current->next)
+   {
+     parent   = (nodeptr) pllStackPop (&nodeStack);
+     nodeInfo = (pllNewickNodeInfo *) current->item;
+
+     /* if inner node place it twice on the stack (out-degree 2) */
+     if (nodeInfo->rank)
+      {
+        child = pInst->nodep[inner ++];
+        pllStackPush (&nodeStack, child->next);
+        pllStackPush (&nodeStack, child->next->next);
+      }
+     else /* check if taxon already exists, i.e. we loaded another tree topology */
+      {
+        if (taxaExist)
+         {
+           assert (pllHashSearch (pInst->nameHash, nodeInfo->name, (void **) &child));
+         }
+        else
+         {
+           child = pInst->nodep[leaf];
+           pInst->nameList[leaf] = strdup (nodeInfo->name);
+           pllHashAdd (pInst->nameHash, pllHashString(pInst->nameList[leaf], pInst->nameHash->size), pInst->nameList[leaf], (void *) (pInst->nodep[leaf]));
+           ++ leaf;
+         }
+      }
+     assert (parent);
+     /* link parent and child */
+     parent->back = child;
+     child->back  = parent;
+
+     if (!taxaExist) pInst->fracchange = 1;
+
+     /* set the branch length */
+     z = exp ((-1 * atof (nodeInfo->branch)) / pInst->fracchange);
+     if (z < PLL_ZMIN) z = PLL_ZMIN;
+     if (z > PLL_ZMAX) z = PLL_ZMAX;
+     for (j = 0; j < PLL_NUM_BRANCHES; ++ j)
+       parent->z[j] = child->z[j] = z;
+   }
+  pllStackClear (&nodeStack);
+
+  return PLL_TRUE;
+}
+
+/** @brief Get the instantaneous rate matrix
+    
+    Obtain the instantaneous rate matrix (Q) for partition \a model
+    of the partition list \a pr, and store it in an array \a outBuffer.
+    
+    @param pr        List of partitions
+    @param model     Index of partition to use
+    @param outBuffer Where to store the instantaneous rate matrix
+
+    @todo Currently, the Q matrix can only be obtained for DNA GTR data.
+
+    @return Returns \b PLL_TRUE in case of success, otherwise \b PLL_FALSE
+*/
+int pllGetInstRateMatrix (partitionList * pr, int model, double * outBuffer)
+{
+  if (pr->partitionData[model]->dataType != PLL_DNA_DATA) return (PLL_FALSE);
+
+  int  i;
+  double mean = 0;
+  double * substRates = pr->partitionData[model]->substRates;
+  double * freqs = pr->partitionData[model]->frequencies;
+  
+  /* normalize substitution rates */
+  for (i = 0; i < 6; ++ i)  substRates[i] /= substRates[5];
+
+  outBuffer[0 * 4 + 1] = (substRates[0] * freqs[1]);
+  outBuffer[0 * 4 + 2] = (substRates[1] * freqs[2]);
+  outBuffer[0 * 4 + 3] = (substRates[2] * freqs[3]);
+
+  outBuffer[1 * 4 + 0] = (substRates[0] * freqs[0]);
+  outBuffer[1 * 4 + 2] = (substRates[3] * freqs[2]);
+  outBuffer[1 * 4 + 3] = (substRates[4] * freqs[3]);
+
+  outBuffer[2 * 4 + 0] = (substRates[1] * freqs[0]);
+  outBuffer[2 * 4 + 1] = (substRates[3] * freqs[1]);
+  outBuffer[2 * 4 + 3] = (substRates[5] * freqs[3]);
+
+  outBuffer[3 * 4 + 0] = (substRates[2] * freqs[0]);
+  outBuffer[3 * 4 + 1] = (substRates[4] * freqs[1]);
+  outBuffer[3 * 4 + 2] = (substRates[5] * freqs[2]);
+
+  outBuffer[0 * 4 + 0] = -(substRates[0] * freqs[1] + substRates[1] * freqs[2] + substRates[2] * freqs[3]);
+  outBuffer[1 * 4 + 1] = -(substRates[0] * freqs[0] + substRates[3] * freqs[2] + substRates[4] * freqs[3]);
+  outBuffer[2 * 4 + 2] = -(substRates[1] * freqs[0] + substRates[3] * freqs[1] + substRates[5] * freqs[3]);
+  outBuffer[3 * 4 + 3] = -(substRates[2] * freqs[0] + substRates[4] * freqs[1] + substRates[5] * freqs[2]);
+
+  for (i = 0; i <  4; ++ i) mean         += freqs[i] * (-outBuffer[i * 4 + i]);
+  for (i = 0; i < 16; ++ i) outBuffer[i] /= mean;
+
+  return (PLL_TRUE);
+}
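+
+/* Usage sketch (illustrative only): the Q matrix is written row-major into a
+   caller-provided buffer of 16 doubles (DNA data only).
+
+     double q[16];
+     int row;
+     if (pllGetInstRateMatrix (pr, 0, q))
+       for (row = 0; row < 4; ++ row)
+         printf ("%f %f %f %f\n", q[4 * row], q[4 * row + 1], q[4 * row + 2], q[4 * row + 3]);
+*/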
+
+/** @ingroup instanceLinkingGroup
+    @brief Initializes the PLL tree topology according to a parsed newick tree
+
+    Set the tree topology based on a parsed and validated newick tree
+
+    @param tr
+      The PLL instance
+
+    @param newick
+      The \a pllNewickTree wrapper structure that contains the parsed newick tree
+
+    @param useDefaultz
+      If set to \b PLL_TRUE then the branch lengths will be reset to the default
+      value.
+*/
+void
+pllTreeInitTopologyNewick (pllInstance * tr, pllNewickTree * newick, int useDefaultz)
+{
+  linkTaxa (tr, newick, tr->nameHash && checkTreeInclusion (tr, newick));
+
+  tr->start = tr->nodep[1];
+
+  if (useDefaultz == PLL_TRUE)
+    resetBranches (tr);
+}
+
+/** @brief Get the node oriented pointer from a round-about node
+
+    Returns the pointer of the round-about node \a p that has the orientation, i.e.
+    has the \a x flag set to 1. In case a tip is passed, the returned pointer
+    is the same as the input.
+
+    @param pInst  PLL instance
+    @param p      One of the three pointers of a round-about node
+
+    @return  Returns the pointer that has the orientation
+*/
+nodeptr pllGetOrientedNodePointer (pllInstance * pInst, nodeptr p)
+{
+  if (p->number <= pInst->mxtips || p->x) return p;
+
+  if (p->next->x) return p->next;
+
+  return p->next->next;
+}
+
+
+//void
+//pllTreeInitTopologyNewick (pllInstance * tr, pllNewickTree * nt, int useDefaultz)
+//{
+//  pllStack * nodeStack = NULL;
+//  pllStack * head;
+//  pllNewickNodeInfo * item;
+//  int i, j, k;
+//  
+///*
+//  for (i = 0; i < partitions->numberOfPartitions; ++ i)
+//   {
+//     partitions->partitionData[i] = (pInfo *) rax_malloc (sizeof (pInfo));
+//     partitions->partitionData[i]->partitionContribution = -1.0;
+//     partitions->partitionData[i]->partitionLH           =  0.0;
+//     partitions->partitionData[i]->fracchange            =  1.0;
+//   }
+//*/
+// 
+//
+// if (tr->nameHash)
+//  {
+//    if (checkTreeInclusion (tr, nt))
+//     {
+//       printf ("It is a subset\n");
+//     }
+//    else
+//     {
+//       printf ("It is not a subset\n");
+//     }
+//  }
+//  
+//  pllTreeInitDefaults (tr, nt->tips);
+//
+//  i = nt->tips + 1;
+//  j = 1;
+//  nodeptr v;
+//  
+//  
+//  for (head = nt->tree; head; head = head->next)
+//  {
+//    item = (pllNewickNodeInfo *) head->item;
+//    if (!nodeStack)
+//     {
+//       pllStackPush (&nodeStack, tr->nodep[i]);
+//       pllStackPush (&nodeStack, tr->nodep[i]->next);
+//       pllStackPush (&nodeStack, tr->nodep[i]->next->next);
+//       ++i;
+//     }
+//    else
+//     {
+//       v = (nodeptr) pllStackPop (&nodeStack);
+//       if (item->rank)  /* internal node */
+//        {
+//          v->back           = tr->nodep[i];
+//          tr->nodep[i]->back = v; //t->nodep[v->number]
+//          pllStackPush (&nodeStack, tr->nodep[i]->next);
+//          pllStackPush (&nodeStack, tr->nodep[i]->next->next);
+//          double z = exp((-1 * atof(item->branch))/tr->fracchange);
+//          if(z < PLL_ZMIN) z = PLL_ZMIN;
+//          if(z > PLL_ZMAX) z = PLL_ZMAX;
+//          for (k = 0; k < PLL_NUM_BRANCHES; ++ k)
+//             v->z[k] = tr->nodep[i]->z[k] = z;
+//
+//          ++ i;
+//        }
+//       else             /* leaf */
+//        {
+//          v->back           = tr->nodep[j];
+//          tr->nodep[j]->back = v; //t->nodep[v->number];
+//
+//          double z = exp((-1 * atof(item->branch))/tr->fracchange);
+//          if(z < PLL_ZMIN) z = PLL_ZMIN;
+//          if(z > PLL_ZMAX) z = PLL_ZMAX;
+//          for (k = 0; k < PLL_NUM_BRANCHES; ++ k)
+//            v->z[k] = tr->nodep[j]->z[k] = z;
+//            
+//          //t->nameList[j] = strdup (item->name);
+//          tr->nameList[j] = (char *) rax_malloc ((strlen (item->name) + 1) * sizeof (char));
+//          strcpy (tr->nameList[j], item->name);
+//          
+//          pllHashAdd (tr->nameHash, tr->nameList[j], (void *) (tr->nodep[j]));
+//          ++ j;
+//        }
+//     }
+//  }
+//  
+//  tr->start = tr->nodep[1];
+//  
+//  pllStackClear (&nodeStack);
+//
+//  if (useDefaultz == PLL_TRUE) 
+//    resetBranches (tr);
+//}
+
+/** @brief Initialize PLL tree with a random topology
+
+    Initializes the PLL tree with a randomly created topology
+
+    @todo
+      Perhaps pass a seed?
+
+    @param tr
+      The PLL instance
+
+    @param tips
+      Number of tips
+
+    @param nameList
+      A set of \a tips names representing the taxa labels
+*/
+void 
+pllTreeInitTopologyRandom (pllInstance * tr, int tips, char ** nameList)
+{
+  int i;
+  pllTreeInitDefaults (tr, tips);
+
+  for (i = 1; i <= tips; ++ i)
+   {
+     tr->nameList[i] = (char *) rax_malloc ((strlen (nameList[i]) + 1) * sizeof (char));
+     strcpy (tr->nameList[i], nameList[i]);
+     pllHashAdd (tr->nameHash, pllHashString(tr->nameList[i], tr->nameHash->size), tr->nameList[i], (void *) (tr->nodep[i]));
+   }
+  
+
+  pllMakeRandomTree (tr);
+}
+
+
+/** @brief Initialize a tree that corresponds to a given (already parsed) alignment 
+
+    Initializes the PLL tree such that it corresponds to the given alignment
+
+    @todo
+      nothing 
+
+    @param tr
+      The PLL instance
+
+    @param alignmentData
+      Parsed alignment
+*/
+void 
+pllTreeInitTopologyForAlignment (pllInstance * tr, pllAlignmentData * alignmentData)
+{
+  int
+    tips = alignmentData->sequenceCount,
+    i;
+
+  char 
+    **nameList = alignmentData->sequenceLabels;
+  
+  pllTreeInitDefaults (tr, tips);
+
+  for (i = 1; i <= tips; ++ i)
+   {
+     tr->nameList[i] = (char *) rax_malloc ((strlen (nameList[i]) + 1) * sizeof (char));
+     strcpy (tr->nameList[i], nameList[i]);
+     pllHashAdd (tr->nameHash, pllHashString(tr->nameList[i], tr->nameHash->size), tr->nameList[i], (void *) (tr->nodep[i]));
+   }
+}
+
+
+/** @brief Compute a randomized stepwise addition order parsimony tree
+
+    Implements the RAxML randomized stepwise addition order algorithm 
+
+    @todo
+      check functions that are invoked for potential memory leaks!
+
+    @param tr
+      The PLL instance
+
+    @param partitions
+      The partitions
+
+    @param sprDist
+      SPR distance for the SPR search in parsimony
+*/
+void pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInstance * tr, partitionList * partitions, int sprDist)
+{
+  allocateParsimonyDataStructures(tr, partitions);
+  pllMakeParsimonyTreeFast(tr, partitions, sprDist);
+  pllFreeParsimonyDataStructures(tr, partitions);
+}
+
+/** @brief Encode the alignment data to the PLL numerical representation
+    
+    Transforms the alignment to the PLL internal representation by substituting each base 
+    with a specific digit.
+
+    @param tr          PLL instance holding the alignment in its \a yVector
+    @param partitions  List of partitions
+*/
+void pllBaseSubstitute (pllInstance * tr, partitionList * partitions)
+{
+  const char * d;
+  int i, j, k;
+
+  for (i = 0; i < partitions->numberOfPartitions; ++ i)
+   {
+     switch (partitions->partitionData[i]->dataType)
+      {
+        case PLL_DNA_DATA:
+          d = PLL_MAP_NT;
+          break;
+        case PLL_BINARY_DATA:
+          d = PLL_MAP_BIN;
+          break;
+        case PLL_AA_DATA:
+          d = PLL_MAP_AA;
+          break;
+        default:
+          assert(0);
+      }
+     
+     for (j = 1; j <= tr->mxtips; ++ j)
+      {
+        for (k = partitions->partitionData[i]->lower; k < partitions->partitionData[i]->upper; ++ k)
+         {
+           tr->yVector[j][k] = d[tr->yVector[j][k]];
+         }
+      }
+   }
+}
+
+/** Clears the rearrangements history from PLL instance
+    
+    Clears the rearrangements rollback information (history) from the PLL instance \a tr.
+
+    @param tr
+      PLL instance
+*/
+void pllClearRearrangeHistory (pllInstance * tr)
+{
+  pllRollbackInfo * ri;
+
+  while ((ri = (pllRollbackInfo *)pllStackPop (&(tr->rearrangeHistory))))
+   {
+     rax_free (ri);
+   }
+}
+
+/** @brief Deallocate the PLL instance
+
+    Deallocates the library instance and all its elements.
+
+    @param tr
+      The PLL instance
+*/
+void
+pllDestroyInstance (pllInstance * tr)
+{
+  int i;
+
+  for (i = 1; i <= tr->mxtips; ++ i)
+    rax_free (tr->nameList[i]);
+  
+  pllHashDestroy (&(tr->nameHash), NULL);
+  if (tr->yVector)
+   {
+     if (tr->yVector[0]) rax_free (tr->yVector[0]);
+     rax_free (tr->yVector);
+   }
+  rax_free (tr->aliaswgt);
+  rax_free (tr->rateCategory);
+  rax_free (tr->patrat);
+  rax_free (tr->patratStored);
+  rax_free (tr->lhs);
+  rax_free (tr->td[0].parameterValues);
+  rax_free (tr->td[0].executeModel);
+  rax_free (tr->td[0].ti);
+  rax_free (tr->nameList);
+  rax_free (tr->nodep);
+  rax_free (tr->nodeBaseAddress);
+  rax_free (tr->tree_string);
+  rax_free (tr->tree0);
+  rax_free (tr->tree1);
+  rax_free (tr->tipNames);
+  rax_free (tr->constraintVector);
+  pllClearRearrangeHistory (tr);
+
+  rax_free (tr);
+
+#ifdef _FINE_GRAIN_MPI
+  pllFinalizeMPI ();
+#endif
+
+}
+
+/* initialize a parameter linkage list for a certain parameter type (can be whatever).
+   the input is an integer vector that contains NumberOfModels (numberOfPartitions) elements.
+
+   if we want to have all alpha parameters unlinked and have say 4 partitions the input 
+   vector would look like this: {0, 1, 2, 3}, if we want to link partitions 0 and 3 the vector 
+   should look like this: {0, 1, 2, 0} 
+*/
+
+
+
+static int init_Q_MatrixSymmetries(char *linkageString, partitionList * pr, int model)
+{
+  int 
+    states = pr->partitionData[model]->states,
+    numberOfRates = ((states * states - states) / 2), 
+    *list = (int *)rax_malloc(sizeof(int) * numberOfRates),
+    j,
+    max = -1;
+
+  char
+    *str1,
+    *saveptr,
+    *ch,
+    *token;
+
+  ch = (char *) rax_malloc (strlen (linkageString) + 1);
+  strcpy (ch, linkageString);
+
+
+  for(j = 0, str1 = ch; ;j++, str1 = (char *)NULL) 
+    {
+      token = STRTOK_R(str1, ",", &saveptr);
+      if(token == (char *)NULL)
+        break;
+      if(!(j < numberOfRates))
+        {
+          errno = PLL_SUBSTITUTION_RATE_OUT_OF_BOUNDS;
+          return PLL_FALSE;
+        }
+      list[j] = atoi(token);     
+    }
+  
+  rax_free(ch);
+
+  for(j = 0; j < numberOfRates; j++)
+    {
+      if(!(list[j] <= j))
+        {
+          errno = PLL_INVALID_Q_MATRIX_SYMMETRY;
+          return PLL_FALSE;
+        }
+      
+      if(!(list[j] <= max + 1))
+        {
+          errno = PLL_Q_MATRIX_SYMMETRY_OUT_OF_BOUNDS;
+          return PLL_FALSE;
+        }
+      
+      if(list[j] > max)
+        max = list[j];
+    }  
+  
+  for(j = 0; j < numberOfRates; j++)  
+    pr->partitionData[model]->symmetryVector[j] = list[j];    
+
+  //less than the maximum possible number of rate parameters
+
+  if(max < numberOfRates - 1)    
+    pr->partitionData[model]->nonGTR = PLL_TRUE;
+
+  pr->partitionData[model]->optimizeSubstitutionRates = PLL_TRUE;
+
+  rax_free(list);
+
+  return PLL_TRUE;
+}
+
+/** @brief Check parameter linkage across partitions for consistency
+ *
+ * Checks that linked alpha, substitution rate and frequency model parameters 
+ * across several partitions are consistent. E.g., when two partitions are linked 
+ * via the alpha parameter, the alpha parameter should either be set to the same 
+ * fixed value or it should be estimated!
+ *
+ * @param pr
+ *   List of partitions
+ *
+ * @todo
+ *   Call this in more functions, right now it's only invoked in the wrapper 
+ *   for modOpt() 
+ */
+static int checkLinkageConsistency(partitionList *pr)
+{
+  if(pr->dirty)
+    {
+      int 
+        i;
+      
+      linkageList 
+        *ll;
+
+      /* first deal with rates */
+
+      ll = pr->rateList;
+        
+      for(i = 0; i < ll->entries; i++)
+        {
+          int
+            partitions = ll->ld[i].partitions,
+            reference = ll->ld[i].partitionList[0];
+          
+          if(pr->partitionData[reference]->dataType == PLL_AA_DATA)
+            {
+              if(pr->partitionData[reference]->protModels == PLL_GTR || pr->partitionData[reference]->nonGTR)                             
+                {
+                  if(!(pr->partitionData[reference]->optimizeSubstitutionRates == PLL_TRUE))
+                    {
+                      errno = PLL_INCONSISTENT_SUBST_RATE_OPTIMIZATION_SETTING;
+                      return PLL_FALSE;
+                    }
+                }
+              else              
+                {
+                  if(!(pr->partitionData[reference]->optimizeSubstitutionRates == PLL_FALSE))
+                    {
+                      errno = PLL_INCONSISTENT_SUBST_RATE_OPTIMIZATION_SETTING;
+                      return PLL_FALSE;
+                    }
+                }                 
+            }
+
+          if(partitions > 1)
+            {
+              int
+                j,
+                k;
+              
+              for(k = 1; k < partitions; k++)
+                {
+                  int 
+                    index = ll->ld[i].partitionList[k];
+                  
+                  int
+                    states = pr->partitionData[index]->states,
+                    rates = ((states * states - states) / 2);
+                  
+                  if(!(pr->partitionData[reference]->nonGTR == pr->partitionData[index]->nonGTR))
+                    {
+                      errno = PLL_INCONSISTENT_SUBST_RATE_OPTIMIZATION_SETTING;
+                      return PLL_FALSE;
+                    }
+                  if(!(pr->partitionData[reference]->optimizeSubstitutionRates == pr->partitionData[index]->optimizeSubstitutionRates))
+                    {
+                      errno = PLL_INCONSISTENT_SUBST_RATE_OPTIMIZATION_SETTING;
+                      return PLL_FALSE;
+                    }
+                
+                  
+                  if(pr->partitionData[reference]->nonGTR)
+                    {              
+                      
+                      for(j = 0; j < rates; j++)                        
+                        {
+                          if(!(pr->partitionData[reference]->symmetryVector[j] == pr->partitionData[index]->symmetryVector[j]))
+                            {
+                              errno = PLL_INCONSISTENT_Q_MATRIX_SYMMETRIES_ACROSS_LINKED_PARTITIONS;
+                              return PLL_FALSE;
+                            }
+                        }
+                    }
+                  
+                 
+                  for(j = 0; j < rates; j++)
+                    {
+                      if(!(pr->partitionData[reference]->substRates[j] == pr->partitionData[index]->substRates[j]))
+                        {
+                          errno = PLL_INCONSISTENT_Q_MATRIX_ENTRIES_ACROSS_LINKED_PARTITIONS;
+                          return PLL_FALSE;
+                        }
+                    }
+                }           
+            }
+        }
+      
+      /* then deal with alpha parameters */
+
+      ll = pr->alphaList;
+
+      for(i = 0; i < ll->entries; i++)
+        {
+          int
+            partitions = ll->ld[i].partitions;
+          
+          if(partitions > 1)
+            {
+              int
+                k, 
+                reference = ll->ld[i].partitionList[0];
+              
+              for(k = 1; k < partitions; k++)
+                {
+                  int 
+                    index = ll->ld[i].partitionList[k];                          
+
+                  if(!(pr->partitionData[reference]->optimizeAlphaParameter == pr->partitionData[index]->optimizeAlphaParameter))
+                    {
+                      errno = PLL_INCONSISTENT_ALPHA_STATES_ACROSS_LINKED_PARTITIONS;
+                      return PLL_FALSE;
+                    }
+                  if(!(pr->partitionData[reference]->alpha == pr->partitionData[index]->alpha))
+                    {
+                      errno = PLL_INCONSISTENT_ALPHA_VALUES_ACROSS_LINKED_PARTITIONS;
+                      return PLL_FALSE;
+                    }
+                }           
+            }
+        }
+
+      /* and then deal with base frequencies */
+
+      ll = pr->freqList;
+
+      for(i = 0; i < ll->entries; i++)
+        {
+          int     
+            partitions = ll->ld[i].partitions;
+          
+          if(partitions > 1)
+            {
+              int               
+                k, 
+                reference = ll->ld[i].partitionList[0];
+              
+              for(k = 1; k < partitions; k++)
+                {
+                  int
+                    j,
+                    index = ll->ld[i].partitionList[k],
+                    states = pr->partitionData[index]->states;                           
+
+                  if(!(pr->partitionData[reference]->optimizeBaseFrequencies == pr->partitionData[index]->optimizeBaseFrequencies))
+                    {
+                      errno = PLL_INCONSISTENT_FREQUENCY_STATES_ACROSS_LINKED_PARTITIONS;
+                      return PLL_FALSE;
+                    }
+
+                  for(j = 0; j < states; j++)
+                    {
+                      if(!(pr->partitionData[reference]->frequencies[j] == pr->partitionData[index]->frequencies[j]))
+                        {
+                          errno = PLL_INCONSISTENT_FREQUENCY_VALUES_ACROSS_LINKED_PARTITIONS;
+                          return PLL_FALSE;
+                        }
+                    }
+                }           
+            }
+        }
+      
+      pr->dirty = PLL_FALSE;
+    }
+
+  return PLL_TRUE;
+}
+/** @brief Set symmetries among parameters in the Q matrix
+    
+    Allows linking some or all rate parameters in the Q matrix
+    in order to obtain models simpler than GTR
+
+    @param string
+      string describing the symmetry pattern among the rates in the Q matrix
+
+    @param pr
+      List of partitions
+      
+    @param model
+      Index of the partition for which we want to set the Q matrix symmetries
+
+    @todo
+      nothing
+*/
+int pllSetSubstitutionRateMatrixSymmetries(char *string, partitionList * pr, int model)
+{
+  int 
+    result = init_Q_MatrixSymmetries(string, pr, model);
+
+  pr->dirty = PLL_TRUE;
+
+  return result;
+}
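+
+/* Usage sketch (illustrative only): assuming the usual A,C,G,T state ordering,
+   the six DNA rates are AC, AG, AT, CG, CT, GT (see pllGetInstRateMatrix above).
+   Linking all transversions and all transitions, i.e. an HKY85-like model,
+   could then be requested as
+
+     pllSetSubstitutionRateMatrixSymmetries ("0,1,0,0,1,0", pr, 0);
+
+   which satisfies the constraints enforced in init_Q_MatrixSymmetries: each
+   entry is at most its index and at most the previous maximum plus one.
+*/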
+
+/** @defgroup modelParamsGroups Model parameters setup and retrieval
+    
+    This set of functions is responsible for setting, retrieving, and optimizing
+    model parameters. It also contains functions for linking model parameters
+    across partitions.
+*/
+
+/** @ingroup modelParamsGroups
+    @brief Set the alpha parameter of the Gamma model to a fixed value for a partition
+    
+    Sets the alpha parameter of the gamma model of rate heterogeneity to a fixed value
+    and disables the optimization of this parameter 
+
+    @param alpha
+      alpha value
+
+    @param model
+      Index of the partition for which we want to set the alpha value
+
+    @param pr
+      List of partitions
+      
+    @param tr
+      Library instance for which we want to fix alpha 
+
+    @todo
+      test if this works with the parallel versions
+*/
+void pllSetFixedAlpha(double alpha, int model, partitionList * pr, pllInstance *tr)
+{
+  //make sure that we are setting alpha for a partition within the current range 
+  //of partitions
+  double old_fracchange = tr->fracchange;
+
+  assert(model >= 0 && model < pr->numberOfPartitions);
+
+  assert(alpha >= PLL_ALPHA_MIN && alpha <= PLL_ALPHA_MAX);
+
+  //set the alpha parameter 
+  
+  pr->partitionData[model]->alpha = alpha;
+
+  //do the discretization of the gamma curve
+
+  pllMakeGammaCats(pr->partitionData[model]->alpha, pr->partitionData[model]->gammaRates, 4, tr->useMedian);
+
+  //broadcast the changed parameters to all threads/MPI processes 
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier(tr, pr, PLL_THREAD_COPY_ALPHA);
+#endif
+
+  pr->partitionData[model]->optimizeAlphaParameter = PLL_FALSE;
+
+  pr->dirty = PLL_FALSE;
+  updateAllBranchLengths (tr, old_fracchange, tr->fracchange);
+}
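+
+/* Usage sketch (illustrative only): fix the gamma shape parameter of
+   partition 0 to 0.75, which also disables its ML optimization.
+
+     pllSetFixedAlpha (0.75, 0, pr, tr);
+*/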
+
+/** @ingroup modelParamsGroups
+    @brief Get the rate categories of the Gamma model of a partition
+
+    Gets the gamma rate categories of the Gamma model of rate heterogeneity
+    of partition \a pid from partition list \a pr.
+
+    @param pr   List of partitions
+    @param pid  Index of partition to use
+    @param outBuffer  Output buffer where to store the rates
+*/
+void pllGetGammaRates (partitionList * pr, int pid, double * outBuffer)
+{
+  /* TODO: Change the hardcoded 4 and also add a check that this partition
+     really uses gamma. Currently, instance is also not required */
+  memcpy (outBuffer, pr->partitionData[pid]->gammaRates, 4 * sizeof (double));
+}
+
+/** @ingroup modelParamsGroups
+    @brief Get the alpha parameter of the Gamma model of a partition
+
+    Returns the alpha parameter of the gamma model of rate heterogeneity
+    of partition \a pid from partition list \a pr.
+
+    @param pr   List of partitions
+    @param pid  Index of partition to use
+
+    @return
+      Alpha parameter
+*/
+double pllGetAlpha (partitionList * pr, int pid)
+{
+  /* TODO: check if the partition uses gamma */
+  return (pr->partitionData[pid]->alpha);
+}
+
+
+/** @ingroup modelParamsGroups
+    @brief Get the base frequencies of a partition
+
+    Gets the base frequencies of partition \a model from partition list
+    \a pr and stores them in \a outBuffer. Note that \a outBuffer
+    must be of size s, where s is the number of states.
+
+    @param pr        List of partitions
+    @param model     Index of the partition for which we want to get the base frequencies
+    @param outBuffer Buffer where to store the base frequencies
+*/
+void pllGetBaseFrequencies(partitionList * pr, int model, double * outBuffer)
+{
+  memcpy (outBuffer, pr->partitionData[model]->frequencies, pr->partitionData[model]->states * sizeof (double));
+}
+
+
+/** @ingroup modelParamsGroups
+    @brief Set all base frequencies to a fixed value for a partition
+    
+    Sets all base frequencies of a partition to fixed values and disables 
+    ML optimization of these parameters 
+
+    @param f
+      array containing the base frequencies
+
+    @param  length
+      length of array f, this needs to be as long as the number of 
+      states in the model, otherwise an assertion will fail!
+
+    @param model
+      Index of the partition for which we want to set the frequencies 
+
+    @param pr
+      List of partitions
+      
+    @param tr
+      Library instance for which we want to fix the base frequencies
+
+    @todo
+      test if this works with the parallel versions
+*/
+void pllSetFixedBaseFrequencies(double *f, int length, int model, partitionList * pr, pllInstance *tr)
+{
+  int 
+    i;
+
+  double 
+    acc = 0.0,
+    old_fracchange;
+
+  old_fracchange = tr->fracchange;
+
+  //make sure that we are setting the base frequencies for a partition within the current range 
+  //of partitions
+  assert(model >= 0 && model < pr->numberOfPartitions);
+
+  //make sure that the length of the input array f containing the frequencies 
+  //is as long as the number of states in the model 
+  assert(length == pr->partitionData[model]->states);
+
+
+  //make sure that the base frequencies sum approximately to 1.0
+  
+  for(i = 0; i < length; i++)
+    acc += f[i];
+
+  if(fabs(acc - 1.0) > 0.000001)
+    assert(0);
+
+  //copy the base frequencies 
+  memcpy(pr->partitionData[model]->frequencies, f, sizeof(double) * length);
+
+  //re-calculate the Q matrix 
+  pllInitReversibleGTR(tr, pr, model);
+
+
+  //broadcast the new Q matrix to all threads/processes 
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier (tr, pr, PLL_THREAD_COPY_RATES);
+#endif
+  
+  pr->partitionData[model]->optimizeBaseFrequencies = PLL_FALSE;
+
+  pr->dirty = PLL_TRUE;
+  updateAllBranchLengths (tr, old_fracchange, tr->fracchange);
+}
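+
+/* Usage sketch (illustrative only): fix uniform DNA base frequencies for
+   partition 0; the array length must equal the number of states and the
+   values must sum to approximately 1.0.
+
+     double f[4] = { 0.25, 0.25, 0.25, 0.25 };
+     pllSetFixedBaseFrequencies (f, 4, 0, pr, tr);
+*/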
+
+/** @ingroup modelParamsGroups
+    @brief Set the base frequencies to be optimized under ML
+    
+    The base frequencies for partition \a model will be optimized under ML
+
+    @param model
+      Index of the partition for which we want to optimize base frequencies 
+
+    @param pr
+      List of partitions
+      
+    @param tr
+      Library instance for which we want to fix the base frequencies
+
+    @todo
+      test if this works with the parallel versions
+*/
+int pllSetOptimizeBaseFrequencies(int model, partitionList * pr, pllInstance *tr)
+{
+  int
+    states,
+    i;
+
+  double 
+    initialFrequency,
+    acc = 0.0;
+
+  //make sure that we are setting the base frequencies for a partition within the current range 
+  //of partitions
+  if(!(model >= 0 && model < pr->numberOfPartitions))
+    {
+      errno = PLL_PARTITION_OUT_OF_BOUNDS;
+      return PLL_FALSE;
+    }
+
+  //set the number of states/frequencies in this partition 
+  states = pr->partitionData[model]->states;
+
+  //set all frequencies to 1/states
+  
+  initialFrequency = 1.0 / (double)states;
+
+  for(i = 0; i < states; i++)
+    pr->partitionData[model]->frequencies[i] = initialFrequency;
+
+  //make sure that the base frequencies sum approximately to 1.0
+  
+  for(i = 0; i < states; i++)
+    acc += pr->partitionData[model]->frequencies[i];
+
+  if(fabs(acc - 1.0) > 0.000001)
+    {
+      errno = PLL_BASE_FREQUENCIES_DO_NOT_SUM_TO_1;
+      return PLL_FALSE;
+    }
+
+  //re-calculate the Q matrix 
+  pllInitReversibleGTR(tr, pr, model);
+
+  //broadcast the new Q matrix to all threads/processes 
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier (tr, pr, PLL_THREAD_COPY_RATES);
+#endif
+  
+  pr->partitionData[model]->optimizeBaseFrequencies = PLL_TRUE;
+
+  pr->dirty = PLL_TRUE;
+
+  return PLL_TRUE;
+}
+
+
+
+
+/** @ingroup modelParamsGroups
+    @brief Get the substitution rates for a specific partition
+
+    Gets the substitution rates of partition \a model from partition list
+    \a pr and stores them in \a outBuffer. Note that \a outBuffer
+    must be of size (s * s - s) / 2, where s is the number of states, i.e.
+    the number of upper diagonal entries of the Q matrix.
+
+    @param pr        List of partitions
+    @param model     Index of partition for which we want to get the substitution rates
+    @param outBuffer Buffer where to store the substitution rates.
+*/
+void pllGetSubstitutionMatrix (partitionList * pr, int model, double * outBuffer)
+{
+  int 
+    rates,
+    states;
+  
+  states = pr->partitionData[model]->states;
+  rates = (states * states - states) / 2;
+
+  memcpy (outBuffer, pr->partitionData[model]->substRates, rates * sizeof (double));
+}
+
+/** @ingroup modelParamsGroups
+     @brief Set all substitution rates for a specific partition and disable ML optimization for them
+    
+    Sets all substitution rates of a partition to fixed values and disables 
+    ML optimization of these parameters. It will automatically re-scale the relative rates  
+    such that the last rate is 1.0 
+
+    @param q
+      array containing the substitution rates
+
+    @param length
+      length of array q, this needs to be as long as: (s * s - s) / 2,
+      i.e., the number of upper diagonal entries of the Q matrix
+
+    @param model
+      Index of the partition for which we want to set/fix the substitution rates
+
+    @param pr
+      List of partitions
+      
+    @param tr
+      Library instance for which we want to fix the substitution rates 
+
+    @todo
+      test if this works with the parallel versions
+*/
+void pllSetFixedSubstitutionMatrix(double *q, int length, int model, partitionList * pr,  pllInstance *tr)
+{
+  pllSetSubstitutionMatrix(q, length, model, pr, tr);
+  pr->partitionData[model]->optimizeSubstitutionRates = PLL_FALSE;
+}
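+
+/* Usage sketch (illustrative only): fix the six DNA substitution rates of
+   partition 0 (order AC, AG, AT, CG, CT, GT); the rates are rescaled so that
+   the last one is 1.0, and their ML optimization is switched off.
+
+     double q[6] = { 1.0, 2.5, 1.0, 1.0, 2.5, 1.0 };
+     pllSetFixedSubstitutionMatrix (q, 6, 0, pr, tr);
+*/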
+
+/** @ingroup modelParamsGroups
+     @brief Set all substitution rates for a specific partition
+    
+    Sets all substitution rates of a partition to the given values.
+    It will automatically re-scale the relative rates such that the last rate is 1.0 
+
+    @param f
+      array containing the substitution rates
+
+    @param length
+      length of array \a q; this needs to be (s * s - s) / 2,
+      i.e., the number of upper diagonal entries of the Q matrix
+
+    @param model
+      Index of the partition for which we want to set/fix the substitution rates
+
+    @param pr
+      List of partitions
+      
+    @param tr
+      Library instance for which we want to fix the substitution rates 
+
+    @todo
+      test if this works with the parallel versions
+*/
+void pllSetSubstitutionMatrix(double *q, int length, int model, partitionList * pr,  pllInstance *tr)
+{
+  int 
+    i,
+    numberOfRates; 
+
+  double
+    scaler,
+    old_fracchange;
+
+  old_fracchange = tr->fracchange;
+
+  //make sure that we are setting the Q matrix for a partition within the current range 
+  //of partitions
+  assert(model >= 0 && model < pr->numberOfPartitions);
+
+  numberOfRates = (pr->partitionData[model]->states * pr->partitionData[model]->states - pr->partitionData[model]->states) / 2;
+
+  //  make sure that the length of the array containing the substitution rates 
+  //  matches the number of rates implied by the number of states in the model
+
+  assert(length == numberOfRates);
+
+  //automatically scale the last rate to 1.0 if this is not already the case
+
+  if(q[length - 1] != 1.0)    
+    scaler = 1.0 / q[length - 1]; 
+  else
+    scaler = 1.0;
+
+  //set the rates for the partition and make sure that they are within the allowed bounds 
+
+  for(i = 0; i < length; i++)
+    {
+      double
+        r = q[i] * scaler;
+      
+      assert(r >= PLL_RATE_MIN && r <= PLL_RATE_MAX);
+      
+      pr->partitionData[model]->substRates[i] = r;
+    }
+
+  //re-calculate the Q matrix 
+  pllInitReversibleGTR(tr, pr, model);
+
+  //broadcast the new Q matrix to all threads/processes 
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier (tr, pr, PLL_THREAD_COPY_RATES);
+#endif
+  
+
+  pr->dirty = PLL_TRUE;
+  updateAllBranchLengths (tr, old_fracchange, tr->fracchange);
+}
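+
+/* Usage sketch (illustrative only): here the last rate passed in is 2.0, so every rate is
+   rescaled by 1 / 2.0 before being stored; ML optimization of the rates stays enabled. */
+static void examplePllSetSubstitutionMatrix (pllInstance * tr, partitionList * pr)
+{
+  double q[6] = { 2.4, 7.2, 1.8, 2.2, 8.0, 2.0 };
+
+  pllSetSubstitutionMatrix (q, 6, 0, pr, tr);   /* stored as 1.2, 3.6, 0.9, 1.1, 4.0, 1.0 */
+}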
+
+
+
+
+/* initialize a parameter linkage list for a certain parameter type (can be whatever).
+   the input is an integer vector that contains NumberOfModels (numberOfPartitions) elements.
+
+   if we want to have all alpha parameters unlinked and have say 4 partitions the input 
+   vector would look like this: {0, 1, 2, 3}, if we want to link partitions 0 and 3 the vector 
+   should look like this: {0, 1, 2, 0} 
+*/
+
+/** @ingroup modelParamsGroups
+*/
+linkageList* initLinkageList(int *linkList, partitionList *pr)
+{
+  int 
+    k,
+    partitions,
+    numberOfModels = 0,
+    i,
+    pos;
+  
+  linkageList 
+    *ll = (linkageList*)rax_malloc(sizeof(linkageList));
+    
+  /* figure out how many distinct parameters we need to estimate 
+     in total, if all parameters are linked the result will be 1 if all 
+     are unlinked the result will be pr->numberOfPartitions */
+  
+  for(i = 0; i < pr->numberOfPartitions; i++)
+    {
+      if(!(linkList[i] >= 0 && linkList[i] < pr->numberOfPartitions))
+        {
+          errno = PLL_LINKAGE_LIST_OUT_OF_BOUNDS;
+          return (linkageList*)NULL;
+        }
+
+      if(!(linkList[i] <= i && linkList[i] <= numberOfModels + 1))
+        {
+          errno = PLL_LINKAGE_LIST_OUT_OF_BOUNDS;
+          return (linkageList*)NULL;
+        }
+
+      if(linkList[i] > numberOfModels)
+        numberOfModels = linkList[i];
+
+    }
+
+  numberOfModels++;
+  
+  /* allocate the linkage list data structure that contains information about which parameters of which partition are 
+     linked with each other.
+
+     Note that we need a separate invocation of initLinkageList() and a separate linkage list 
+     for each parameter type */
+
+  ll->entries = numberOfModels;
+  ll->ld      = (linkageData*)rax_malloc(sizeof(linkageData) * numberOfModels);
+
+  /* now loop over the number of free parameters and assign the corresponding partitions to each parameter */
+
+  for(i = 0; i < numberOfModels; i++)
+    {
+      /* 
+         the valid flag is used for distinguishing between DNA and protein data partitions.
+         This can be used to enable/disable parameter optimization for the parameter 
+         associated to the corresponding partitions. This feature is used in optRatesGeneric 
+         to first optimize all DNA GTR rate matrices and then all PROT GTR rate matrices */
+
+      ll->ld[i].valid = PLL_TRUE;
+      partitions = 0;
+
+      /* now figure out how many partitions share this joint parameter */
+
+      for(k = 0; k < pr->numberOfPartitions; k++)
+        if(linkList[k] == i)
+          partitions++;     
+
+      /* assign a list to store the partitions that share the parameter */
+
+      ll->ld[i].partitions = partitions;
+      ll->ld[i].partitionList = (int*)rax_malloc(sizeof(int) * partitions);
+      
+      /* now store the respective partition indices in this list */
+      
+      for(k = 0, pos = 0; k < pr->numberOfPartitions; k++)
+        if(linkList[k] == i)
+          ll->ld[i].partitionList[pos++] = k;
+    }
+
+  /* return the linkage list for the parameter */
+
+  return ll;
+}
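+
+/* Usage sketch (illustrative only): assuming four partitions, {0, 1, 2, 3} would keep a
+   parameter unlinked, whereas {0, 1, 2, 0} links partitions 0 and 3 so that they share a
+   single estimate, giving three distinct parameters in total. */
+static void exampleInitLinkageList (partitionList * pr)
+{
+  int linked[4] = { 0, 1, 2, 0 };
+  linkageList * ll = initLinkageList (linked, pr);
+
+  /* ll->entries == 3 and ll->ld[0].partitions == 2 (partitions 0 and 3 share parameter 0) */
+  freeLinkageList (ll);
+}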
+
+
+
+static linkageList* initLinkageListString(char *linkageString, partitionList * pr)
+{
+  int 
+    *list = (int*)rax_malloc(sizeof(int) * pr->numberOfPartitions),
+    j;
+
+  linkageList 
+    *l;
+
+  char
+    *str1,
+    *saveptr,
+//    *ch = strdup(linkageString),
+    *ch,
+    *token;
+  
+  ch = (char *) rax_malloc (strlen (linkageString) + 1);
+  strcpy (ch, linkageString);
+
+  for(j = 0, str1 = ch; ;j++, str1 = (char *)NULL) 
+    {
+      token = STRTOK_R(str1, ",", &saveptr);
+      if(token == (char *)NULL)
+        break;
+      assert(j < pr->numberOfPartitions);
+      list[j] = atoi(token);
+    }
+  
+  rax_free(ch);
+
+  l = initLinkageList(list, pr);
+  
+  rax_free(list);
+
+  return l;
+}
+
+/** @ingroup modelParamsGroups
+    @brief Link alpha parameters across partitions
+    
+    Links alpha parameters across partitions (GAMMA model of rate heterogeneity)
+
+    @param string
+      string describing the linkage pattern    
+
+    @param pr
+      List of partitions
+
+    @todo
+      test behavior/impact/mem-leaks of this when PSR model is used 
+      it shouldn't do any harm, but it would be better to check!
+*/
+int pllLinkAlphaParameters(char *string, partitionList *pr)
+{
+  //assumes that it has already been assigned once
+  freeLinkageList(pr->alphaList);
+  
+  pr->alphaList = initLinkageListString(string, pr); 
+
+  pr->dirty = PLL_TRUE;
+  
+  if(!pr->alphaList)
+    return PLL_FALSE;
+  else
+    return PLL_TRUE;
+}
+
+/** @ingroup modelParamsGroups
+    @brief Link base frequency parameters across partitions
+    
+    Links base frequency parameters across partitions
+
+    @param string
+      string describing the linkage pattern    
+
+    @param pr
+      List of partitions
+
+    @todo
+      semantics of this function not clear yet: right now this only has an effect 
+      when we do a ML estimate of base frequencies 
+      when we use empirical or model-defined (protein data) base frequencies, one could 
+      maybe average over the per-partition frequencies, but the averages would need to be weighted 
+      according to the number of patterns per partition 
+*/
+int pllLinkFrequencies(char *string, partitionList *pr)
+{
+  //assumes that it has already been assigned once
+  freeLinkageList(pr->freqList);
+
+  pr->freqList = initLinkageListString(string, pr);
+
+  pr->dirty = PLL_TRUE;
+
+  if(!pr->freqList)
+    return PLL_FALSE;
+  else
+    return PLL_TRUE;
+}
+
+/** @ingroup modelParamsGroups
+    @brief Link Substitution matrices across partitions
+    
+    Links substitution matrices (Q matrices) across partitions
+
+    @param string
+      string describing the linkage pattern    
+
+    @param pr
+      List of partitions
+
+    @todo
+      re-think/re-design how this is done for protein
+      models
+*/
+int pllLinkRates(char *string, partitionList *pr)
+{
+  //assumes that it has already been assigned once
+  freeLinkageList(pr->rateList);
+  
+  pr->rateList = initLinkageListString(string, pr);
+  
+  pr->dirty = PLL_TRUE;  
+
+  if(!pr->rateList)
+    return PLL_FALSE;
+  else
+    return PLL_TRUE;
+}
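+
+/* Usage sketch (illustrative only): all three linking functions accept the same
+   comma-separated linkage string; assuming four partitions, "0,0,0,0" links a parameter
+   across all partitions, "0,1,2,0" links only partitions 0 and 3, and "0,1,2,3" keeps the
+   parameter unlinked. */
+static void exampleLinkModelParameters (partitionList * pr)
+{
+  pllLinkAlphaParameters ("0,0,0,0", pr);   /* one shared alpha for all partitions */
+  pllLinkFrequencies     ("0,1,2,0", pr);   /* partitions 0 and 3 share base frequencies */
+  pllLinkRates           ("0,1,2,3", pr);   /* substitution rates stay unlinked */
+}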
+
+
+
+
+/** @ingroup modelParamsGroups
+    @brief Initialize partitions according to model parameters
+    
+    Initializes partitions according to model parameters.
+
+    @param tr              The PLL instance
+    @param partitions      List of partitions
+    @return                Returns \b PLL_TRUE in case of success, otherwise \b PLL_FALSE
+*/
+int pllInitModel (pllInstance * tr, partitionList * partitions) 
+{
+  double ** ef;
+  int
+    i,
+    *unlinked = (int *)rax_malloc(sizeof(int) * partitions->numberOfPartitions);
+  double old_fracchange = tr->fracchange;
+
+  ef = pllBaseFrequenciesInstance (tr, partitions);
+
+  if(!ef)
+    {
+      rax_free(unlinked);
+      return PLL_FALSE;
+    }
+
+  
+#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
+#if (defined(__AVX) || defined(__SSE3))
+  _mm_setcsr( _mm_getcsr() | _MM_FLUSH_ZERO_ON);
+#endif
+#endif 
+
+#ifdef _USE_PTHREADS
+  tr->threadID = 0;
+#ifndef _PORTABLE_PTHREADS
+  /* thread-to-core pinning is not very portable; if _PORTABLE_PTHREADS is not defined,
+     the code below is activated by default */
+  pinToCore(0);
+#endif
+#endif
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  /* 
+     this main function is the master thread, so if we want to run RAxML with n threads,
+     we use pllStartPthreads to start the n-1 worker threads */
+  
+#ifdef _USE_PTHREADS
+  pllStartPthreads (tr, partitions);
+#endif
+
+  /* via pllMasterBarrier() we invoke parallel regions in which all Pthreads work on computing something, mostly likelihood 
+     computations. Have a look at execFunction() in axml.c where we switch over the different types of parallel regions.
+
+     Although not necessary, below we copy the info stored on tr->partitionData to corresponding copies in each thread.
+     While this is shared memory and we don't really need to copy stuff, it was implemented like this to allow for an easier 
+     transition to a distributed memory implementation (MPI).
+     */
+#ifdef _FINE_GRAIN_MPI
+  //MPI_Bcast (&(partitions->numberOfPartitions), 1, MPI_INT, MPI_ROOT, MPI_COMM_WORLD);
+  MPI_Bcast (&(partitions->numberOfPartitions), 1, MPI_INT, 0, MPI_COMM_WORLD);
+#endif
+  
+  /* mpi version now also uses the generic barrier */
+  pllMasterBarrier (tr, partitions, PLL_THREAD_INIT_PARTITION);
+#else  /* SEQUENTIAL */
+  /* 
+     allocate the required data structures for storing likelihood vectors etc 
+     */
+
+  //initializePartitions(tr, tr, partitions, partitions, 0, 0);
+  initializePartitionsSequential (tr, partitions);
+#endif
+  
+  //initializePartitions (tr, tr, partitions, partitions, 0, 0);
+  
+  initModel (tr, ef, partitions);
+
+  pllEmpiricalFrequenciesDestroy (&ef, partitions->numberOfPartitions);
+
+  for(i = 0; i < partitions->numberOfPartitions; i++)
+    unlinked[i] = i;
+
+  //by default everything is unlinked initially 
+  partitions->alphaList = initLinkageList(unlinked, partitions);
+  partitions->freqList  = initLinkageList(unlinked, partitions);
+  partitions->rateList  = initLinkageList(unlinked, partitions);
+
+  rax_free(unlinked);
+
+  updateAllBranchLengths (tr, old_fracchange ? old_fracchange : 1,  tr->fracchange);
+  pllEvaluateLikelihood (tr, partitions, tr->start, PLL_TRUE, PLL_FALSE);
+
+  return PLL_TRUE;
+}
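+
+/* Usage sketch (illustrative only): pllInitModel() is called once, after the alignment has
+   been loaded into the instance and the partition scheme has been set up; on success the
+   partitions hold empirical base frequencies, default model parameters and unlinked
+   alpha/frequency/rate linkage lists. */
+static void examplePllInitModel (pllInstance * tr, partitionList * partitions)
+{
+  if (pllInitModel (tr, partitions) != PLL_TRUE)
+    {
+      /* initialization failed, e.g. the empirical base frequencies could not be computed */
+    }
+}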
+ 
+/** @ingroup modelParamsGroups
+    @brief Optimize all free model parameters of the likelihood model
+    
+    Optimizes all free model parameters of the likelihood model until the
+    improvement in log likelihood drops below \a likelihoodEpsilon.
+
+    @param tr
+      The PLL instance
+
+    @param pr
+      List of partitions
+
+    @param likelihoodEpsilon
+      Specifies up to which epsilon in likelihood values the iterative routine will 
+      be optimizing the parameters  
+*/
+int pllOptimizeModelParameters(pllInstance *tr, partitionList *pr, double likelihoodEpsilon)
+{
+  //force the consistency check
+
+  pr->dirty = PLL_TRUE;
+
+  if(!checkLinkageConsistency(pr))
+    return PLL_FALSE;
+
+  modOpt(tr, pr, likelihoodEpsilon);
+
+  return PLL_TRUE;
+}
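+
+/* Usage sketch (illustrative only): optimize all free model parameters until the
+   improvement in log likelihood drops below 0.1 log-likelihood units. */
+static void examplePllOptimizeModelParameters (pllInstance * tr, partitionList * pr)
+{
+  if (pllOptimizeModelParameters (tr, pr, 0.1) != PLL_TRUE)
+    {
+      /* the linkage lists were inconsistent; no optimization was performed */
+    }
+}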
+
+/** @brief Read the contents of a file
+    
+    Reads the file \a filename and returns its contents. In addition,
+    the size of the file is stored in the output variable \a filesize.
+    The initial value of \a filesize is ignored and will be
+    overwritten.
+
+    @param filename
+      Name of the input file
+
+    @param filesize
+      Input parameter where the size of the file (in bytes) will be stored
+
+    @return
+      Contents of the file
+*/
+char * 
+pllReadFile (const char * filename, long * filesize)
+{
+  FILE * fp;
+  char * rawdata;
+
+  // FIX BUG: opening with "r" does not work on Windows
+//  fp = fopen (filename, "r");
+  printf("[PLL] Reading file %s...\n", filename);
+  fp = fopen (filename, "rb");
+  if (!fp) return (NULL);
+  printf("[PLL] Success!\n");
+
+  /* obtain file size */
+  if (fseek (fp, 0, SEEK_END) == -1)
+   {
+     fclose (fp);
+     return (NULL);
+   }
+
+  *filesize = ftell (fp);
+
+  if (*filesize == -1) 
+   {
+     fclose (fp);
+     return (NULL);
+   }
+  rewind (fp);
+
+  /* allocate buffer and read file contents */
+  rawdata = (char *) rax_malloc (((*filesize) + 1) * sizeof (char));
+  if (rawdata) 
+   {
+     if (fread (rawdata, sizeof (char), *filesize, fp) != (size_t) *filesize) 
+      {
+        rax_free (rawdata);
+        rawdata = NULL;
+      }
+     else
+      {
+        rawdata[*filesize] = 0;
+      }
+   }
+
+  fclose (fp);
+
+  return (rawdata);
+}
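+
+/* Usage sketch (illustrative only): read a whole file into memory; the returned buffer is
+   NUL-terminated and must be released with rax_free(). "example.phy" is a placeholder
+   file name. */
+static void examplePllReadFile (void)
+{
+  long filesize = 0;
+  char * rawdata = pllReadFile ("example.phy", &filesize);
+
+  if (rawdata)
+    {
+      /* use the filesize bytes stored in rawdata ... */
+      rax_free (rawdata);
+    }
+}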
+
+static void getInnerBranchEndPointsRecursive (nodeptr p, int tips, int * i, node **nodes)
+{
+  if (!isTip (p->next->back->number, tips))
+   {
+     nodes[(*i)++] = p->next;
+     getInnerBranchEndPointsRecursive(p->next->back, tips, i, nodes);
+   }
+  if (!isTip (p->next->next->back->number, tips))
+   {
+     nodes[(*i)++] = p->next->next;
+     getInnerBranchEndPointsRecursive(p->next->next->back, tips, i, nodes);
+   }
+}
+
+node ** pllGetInnerBranchEndPoints (pllInstance * tr)
+{
+  node ** nodes;
+  nodeptr p;
+  int i = 0;
+
+  nodes = (node **) rax_calloc(tr->mxtips - 3, sizeof(node *));
+
+  p = tr->start;
+  assert (isTip(p->number, tr->mxtips));
+
+  getInnerBranchEndPointsRecursive(p->back, tr->mxtips, &i, nodes);
+
+  return nodes;
+}
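+
+/* Usage sketch (illustrative only): the returned array holds one end point per inner
+   branch, i.e. tr->mxtips - 3 entries for an unrooted binary tree; the caller frees it. */
+static void examplePllGetInnerBranchEndPoints (pllInstance * tr)
+{
+  int i;
+  node ** ends = pllGetInnerBranchEndPoints (tr);
+
+  for (i = 0; i < tr->mxtips - 3; i++)
+    {
+      nodeptr p = ends[i];   /* p and p->back are the two end points of inner branch i */
+      (void) p;
+    }
+
+  rax_free (ends);
+}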
+
+#if defined WIN32 || defined _WIN32 || defined __WIN32__
+/* calloc() replacement built on top of rax_malloc() for the Windows build */
+void* rax_calloc(size_t count, size_t size) {
+  void* res = rax_malloc(size * count);
+  memset(res, 0, size * count);
+  return res;
+}
+#endif
+
diff --git a/pllnni.h b/pllnni.h
index 35edd6c..a7a0f62 100644
--- a/pllnni.h
+++ b/pllnni.h
@@ -12,7 +12,7 @@
 #include <vector>
 //#include <unordered_set>
 extern "C" {
-#include "pllrepo/src/pllInternal.h"
+#include "pll/pllInternal.h"
 }
 
 typedef struct {

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/iqtree.git



More information about the debian-med-commit mailing list